Exemplo n.º 1
0
def extract_data(tag):
    """Extract (cover_art, metadata) from a mutagen tag object.

    Returns a 2-tuple ``(art, meta)`` where ``art`` is ``None`` or an
    ``(image_data, extension)`` pair and ``meta`` is the cleaned tag mapping.
    Implicitly returns ``None`` for unsupported tag types (unchanged from the
    original behaviour).
    """
    # Use isinstance() instead of type() comparison; also drop the dead
    # `pass` statements that followed unconditional returns.
    if isinstance(tag, mutagen.easymp4.EasyMP4):
        if 'cover' in tag:
            # NOTE(review): EasyMP4's 'cover' value is typically a *list* of
            # MP4Cover objects; the attribute access below assumes a single
            # cover object -- confirm against the mutagen version in use.
            picture = tag['cover']
            exts = {picture.FORMAT_JPEG: '.jpeg', picture.FORMAT_PNG: '.png'}
            return ((picture, exts[picture.imageformat]), clean_keys(tag))
        return (None, clean_keys(tag))
    if isinstance(tag, mutagen.easyid3.EasyID3):
        if 'cover' in tag:
            picture = tag['cover']
            ext = mimetypes.guess_extension(picture.mime)
            return ((picture.data, ext), clean_keys(tag))
        return (None, clean_keys(tag))
    if isinstance(tag, mutagen.flac.FLAC):
        if tag.pictures:
            picture = tag.pictures[0]
            ext = mimetypes.guess_extension(picture.mime)
            return ((picture.data, ext), clean_keys(tag))
        return (None, clean_keys(tag))
Exemplo n.º 2
0
	def getImage(self, imageUrl, referrer):
		"""Fetch ``imageUrl`` (sending ``referrer`` as the Referer header) and
		return a ``(filename, content)`` tuple.

		The filename is derived from the URL path; when it lacks an extension
		one is appended from the Content-Type header, and an extension guessed
		from the actual bytes (via libmagic) is appended afterwards.

		Raises ValueError when the page could not be retrieved.
		"""
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)

		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)

		if not "." in fileN:
			info = handle.info()
			if 'Content-Type' in info:
				tp = info['Content-Type']
				if ";" in tp:
					tp = tp.split(";")[0]
				ext = guess_extension(tp)
				if ext is None:
					ext = "unknown_ftype"
				print(info['Content-Type'], ext)
				fileN += "." + ext
			else:
				fileN += ".jpg"

		# Let magic figure out the files for us (it's probably smarter then kissmanga, anyways.)
		guessed = magic.from_buffer(content, mime=True)
		# BUG FIX: this previously called guess_extension(tp), which ignored
		# the magic-sniffed mimetype entirely and raised NameError whenever
		# fileN already contained a "." (tp was never assigned on that path).
		ext = guess_extension(guessed)
		if ext:
			fileN = fileN + ext

		return fileN, content
def create_url_filename(url_str, content_type):
	"""Derive a filesystem path for *url_str*, using *content_type* to choose
	a file extension when the URL's own extension is missing or unsuitable.

	See also: http://stackoverflow.com/a/7406369/1391325
	"""
	parts = urlsplit(url_str)
	# Reverse the hostname labels so e.g. "www.example.com" becomes
	# "com/example/www", then append the URL path.
	host_as_dir = os.path.sep.join(reversed(parts[1].split('.')))
	base, existing_ext = os.path.splitext("".join((host_as_dir, parts[2])))
	base = base.translate(URL_FILENAME_TRANSLATION_TABLE)
	if base.endswith(os.path.sep):
		base = base[:-len(os.path.sep)]

	canonical_ext = mimetypes.guess_extension(content_type)
	if not existing_ext:
		# No extension in the URL: use the canonical one for the content
		# type, or a default suffix to avoid clashes with other URLs.
		return base + (canonical_ext if canonical_ext else DEFAULT_OUTPATH_SUFFIX)
	if existing_ext in mimetypes.guess_all_extensions(content_type):
		# The URL's extension already agrees with the content type; re-attach
		# it to the normalized base.
		return base + existing_ext
	if canonical_ext:
		# Prefer the canonical extension for the declared content type.
		return base + canonical_ext
	# Unknown content type: keep the original extension, normalized.
	return base + normalize_url_component(existing_ext, ".")
Exemplo n.º 4
0
    def post(self):
        """Handle an image upload: validate the body, persist it under a
        random 32-hex-char id, and answer 201 with a Location header."""
        if self.multipart_form_data():
            # multipart/form-data: body and filename come from the form part.
            upload = self.request.files['media'][0]
            body = upload['body']
            filename = upload['filename']
        else:
            # Raw body upload: the filename (if any) travels in the Slug header.
            body = self.request.body
            filename = self.request.headers.get('Slug')

        if self.validate(body):
            if not filename:
                # No filename supplied: build one from the configured default
                # plus an extension guessed first from Content-Type, then from
                # the body bytes, finally falling back to no extension.
                content_type = self.request.headers.get('Content-Type', BaseEngine.get_mimetype(body))
                extension = mimetypes.guess_extension(content_type.split(';', 1)[0], False)
                if extension is None:
                    extension = mimetypes.guess_extension(BaseEngine.get_mimetype(body), False)
                if extension == '.jpe':
                    # mimetypes prefers the unusual ".jpe" spelling for JPEG.
                    extension = '.jpg'
                if extension is None:
                    extension = ''
                filename = self.context.config.UPLOAD_DEFAULT_FILENAME + extension

            id = str(uuid.uuid4().hex)
            self.write_file(id, body)
            self.set_status(201)
            self.set_header('Location', self.location(id, filename))
Exemplo n.º 5
0
	def get_file_name_mime(self, url):
		"""Fetch *url* and return ``(content, filename, mimetype)``.

		Falls back through the server-supplied filename, the URL path
		component, and finally a synthesized "name.ext" when neither yields
		anything usable.
		"""
		pgctnt, hName, mime = self.wg.getFileNameMime(url)

		parsed = urllib.parse.urlparse(url)
		pathname = os.path.split(parsed.path)[-1]
		if not hName and not mime and not pathname:
			self.log.error("cannot figure out content type for url: %s", url)
			return pgctnt, "unknown.unknown", "application/octet-stream"

		# empty path with mimetype of text/html generally means it's a directory index (or some horrible dynamic shit).
		if not hName and not pathname and mime == "text/html":
			self.log.info("No path and root location. Assuming index.html")
			return pgctnt, "index.html", "text/html"

		# BUG FIX: mimetypes.guess_type() returns (type, encoding); the old
		# code unpacked the *encoding* into a variable named "guessed_mime"
		# and could return it (e.g. "gzip" or None) as the mimetype whenever
		# the server supplied no mime.
		guessed_type, _encoding = mimetypes.guess_type(hName)
		if guessed_type:
			return pgctnt, hName, mime if mime else guessed_type

		guessed_type, _encoding = mimetypes.guess_type(pathname)
		if guessed_type:
			return pgctnt, pathname, mime if mime else guessed_type

		chunks = [chunk for chunk in (hName, pathname) if chunk]

		outname = " - ".join(chunks)
		newext = (mimetypes.guess_extension(mime) if mime else None) or ".unknown"

		if not outname:
			outname = "unknown"
		return pgctnt, outname + newext, mime if mime else "application/octet-stream"
def start_recording(name, queue):
    """Worker loop: pull schedules from *queue* and record each stream until
    its end time, moving the marker file to COMPLETED_DIR when done.

    On failure the schedule is re-queued so recording can be retried.
    (Python 2 code: uses cookielib/urllib2.)
    """
    date_format = CONFIG.get(__SETTINGS_SECTION, "date_format")
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    while True:
        schedule = queue.get()
        log.info("Start recording for %s" % schedule)
        end_time = datetime.strptime(schedule['end'], date_format)
        shutil.move(schedule['file'], INPROGRESS_DIR)
        schedule['file'] = os.path.join(INPROGRESS_DIR, schedule['file_name'])
        response = opener.open(schedule['url'])
        # Pick an extension from the response header or the schedule's mime.
        # BUG FIX: guess_extension() can return None, which previously made
        # the os.path.join() below fail with a TypeError.
        extension = ".mpeg"
        if 'Content-type' in response.info():
            extension = mimetypes.guess_extension(response.info()['Content-type']) or ".mpeg"
        elif 'mime' in schedule:
            extension = mimetypes.guess_extension(schedule['mime']) or ".mpeg"
        f = open(os.path.join(DOWNLOADS_DIR, schedule['name'] + extension), "a+")
        try:
            while end_time >= datetime.now() and os.path.exists(schedule['file']):
                f.write(response.read(__BUFFER))
                # BUG FIX: this line was tab-indented while the surrounding
                # block uses spaces (a TabError under Python 3).
                f.flush()
            shutil.move(schedule['file'], COMPLETED_DIR)
            log.info("Recording of %s is done" % schedule)
        except Exception as e:
            log.error("Failed to record %s. Will try to restart." % schedule, e)
            queue.put(schedule, True, __RETRY_INTERVAL)
        finally:
            f.close()
            response.close()
Exemplo n.º 7
0
    def execute(self, transform_manager):
        """Download self.url and copy it into a file managed by
        *transform_manager*, returning the saved file's name.

        The extension is chosen, in order of preference, from: an explicit
        self.extension, the per-content-type override table, mimetypes
        (strict, then non-strict), and finally the literal 'unknown'.

        Raises TransformException when the download failed.
        """
        filename, headers = retrieve(url=self.url,
                                     user=transform_manager.owner,
                                     username=self.username,
                                     password=self.password,
                                     user_agent=self.user_agent)

        try:
            if headers.get('error'):
                raise TransformException("Failed to download %s" % self.url)
            if not filename:
                raise TransformException(headers.get('message'))

            # Strip any "; charset=..." parameters before mapping the type.
            content_type = headers.get('content-type', 'unknown/unknown')
            content_type = content_type.split(';')[0].strip()
            extension = self.extension \
                     or self.mimetype_overrides.get(content_type) \
                     or (mimetypes.guess_extension(content_type) or '').lstrip('.') \
                     or (mimetypes.guess_extension(content_type, strict=False) or '').lstrip('.') \
                     or 'unknown'

            logger.debug("Response had content-type %r; assigning extension %r" % (content_type, extension))

            with open(transform_manager(extension, self.name), 'w') as output:
                # NOTE(review): [input] here is the *builtin* input function --
                # this looks like a bug (a filename or file object was
                # probably intended); confirm against transform_manager.start.
                transform_manager.start(self, [input], type='identity')
                with open(filename, 'r') as f:
                    shutil.copyfileobj(f, output)

                logger.info("File from %r saved to %r" % (self.url, output.name))
                return output.name
        finally:
            # retrieve() signals via headers whether its temp file must be
            # cleaned up after use.
            if headers['delete-after']:
                os.unlink(filename)
Exemplo n.º 8
0
def write_html(client, xml_string, filename):
    """Serialise *xml_string* to *filename* as HTML, downloading any external
    images/objects it references into sibling "_images/" and "_attachments/"
    directories and rewriting the element references accordingly.
    """
    tree = etree.fromstring(strip_ns(xml_string))
    elements_to_download = tree.xpath('//img[@src]') + tree.xpath('//object[starts-with(@data, "https://")]')
    if not elements_to_download:
        return serialise_html(xml_string, filename)

    serialise_html(xml_string, filename)

    filename_base = os.path.splitext(filename)[0]

    for external in elements_to_download:
        part_id = id_generator()
        if external.tag == 'img':
            image_dir = filename_base + '_images/'
            if not os.path.exists(image_dir):
                os.makedirs(image_dir)
            # BUG FIX: guess_extension() may return None; fall back to no
            # extension instead of crashing on the string concatenation.
            ext = mimetypes.guess_extension(external.get('data-src-type')) or ''
            data = client.do_request(external.get('data-fullres-src'), raw=True)
            outfile = os.path.join(image_dir, part_id + ext)
            write_image(data, outfile)
            data = client.do_request(external.get('src'), raw=True)
            # BUG FIX: b64encode() returns bytes; decode before mixing with
            # str (works on Python 2 as well).
            encoded = base64.b64encode(data).decode('ascii')
            external.set('src', 'data:' + external.get('data-src-type') + ';base64,' + encoded)
        if external.tag == 'object':
            attachment_dir = filename_base + '_attachments/'
            if not os.path.exists(attachment_dir):
                os.makedirs(attachment_dir)
            extension = mimetypes.guess_extension(external.get('type')) or ''
            if external.get('type') == 'application/vnd.ms-excel':
                extension = ''
            outfile = os.path.join(attachment_dir, external.get('data-attachment') + extension)
            data = client.do_request(external.get('data'), raw=True)
            write_image(data, outfile)
            external.set('data', 'file://' + os.path.abspath(outfile))

    ET.ElementTree(tree).write(filename, method="html")
Exemplo n.º 9
0
    def _guess_destination(self, torrent_files):
        """
        try to identify the correct category of the finished torrent
        and return the destination path where the torrent has to be moved

        Returns [destination_path, category_label].
        """
        download_path = self.config["download_path"]

        for file in torrent_files:
            ext = os.path.splitext(file["path"])[1].lower()
            try:
                # types_map raises KeyError for unknown extensions; that is
                # the "try the next file" signal.
                # BUG FIX: removed a stray mt.guess_extension(ext) call -- it
                # expects a *mimetype*, not an extension, and its result was
                # discarded anyway.
                res = mt.types_map[ext]
            except KeyError:
                log.debug("unknown extension %s, trying again", ext)
                continue
            if res in GREY_LIST:
                log.debug("skipping GREY_LIST extension %s", res)
                continue
            if res.startswith("audio"):
                return [os.path.join(download_path, self.config["sub_audio"]), "audio"]
            elif res.startswith("video"):
                return [os.path.join(download_path, self.config["sub_video"]), "video"]
            elif ext in DOC_FORMAT:
                return [os.path.join(download_path, self.config["sub_documents"]), "doc"]
            elif ext in DATA_FORMAT:
                return [os.path.join(download_path, self.config["sub_data"]), "data"]

        return [os.path.join(download_path, self.config["sub_uncat"]), "uncategorized"]
Exemplo n.º 10
0
def unpack_mail(msg, only_headers=False, exclude_headers=True):
    """Split an email.message.Message into (msg_text, msg_html, attachments).

    *attachments* is a list of dicts with keys data/filename/content_type/
    is_multipart.  Parts lacking a filename get a synthesized
    'part-NNN<ext>' name.
    """
    # TODO: headers, msg_text, msg_html, attachments
    msg_text = ""
    msg_html = ""
    if not msg.is_multipart():
        msg_payload = msg.get_payload(decode=True)
        msg_payload = decode_text(msg_payload)
        # BUG FIX: get_content_type was compared without being *called*, so
        # the text/html branch could never be taken.
        if msg.get_content_type() == 'text/html':
            msg_html = msg_payload
        else:   # text/plain. or other?
            msg_text = msg_payload
        return msg_text, msg_html, []

    attachments = []
    counter = 1
    for part in msg.walk():
        # multipart/* are just containers
        if part.get_content_maintype() == 'multipart':
            continue

        is_multipart = part.is_multipart()
        filename = part.get_filename()
        filename = decode_mail_header(filename)
        content_type = part.get_content_type()

        if is_multipart or filename:    # an attachment
            if not filename:    # maybe not possible
                ext = mimetypes.guess_extension(content_type)
                if not ext:
                    ext = '.bin'
                filename = 'part-%03d%s' % (counter, ext)
            attachments.append({
                "data": part.get_payload(),
                "filename": filename,
                "content_type": content_type,
                "is_multipart": is_multipart,
            })
        else:
            part_payload = part.get_payload(decode=True)
            part_payload = decode_text(part_payload)
            if content_type == 'text/plain':
                msg_text = part_payload
            elif content_type == 'text/html':
                msg_html = part_payload
            else:   # maybe not possible
                ext = mimetypes.guess_extension(content_type)
                if not ext:
                    ext = '.bin'
                filename = 'part-%03d%s' % (counter, ext)
                attachments.append({
                    "data": part.get_payload(),
                    "filename": filename,
                    "content_type": content_type,
                    # BUG FIX: was "is_multipart()" -- calling the bool
                    # raised TypeError whenever this branch ran.
                    "is_multipart": is_multipart,
                })

        counter += 1

    return msg_text, msg_html, attachments
Exemplo n.º 11
0
    def fetch(self, fetch_info, target_path, progress):
        """
        Fetch a file.

        Streams fetch_info['url'] into *target_path*, reporting progress, and
        returns {'archive_path': ..., 'archive_type': (mimetype, encoding)}.
        """
        response = requests.get(fetch_info['url'], stream=True)
        response.raise_for_status()

        # An explicitly configured mimetype wins over the server's header.
        mimetype = fetch_info['mimetype'] or response.headers.get('content-type')
        archive_type = (mimetype, response.headers.get('content-encoding'))

        extension = (mimetypes.guess_extension(fetch_info['mimetype'])
                     if fetch_info['mimetype'] else None)
        if not extension:
            extension = mimetypes.guess_extension(mimetype)
        if not extension:
            LOGGER.debug('No extension registered for this mimetype (%s). Guessing one from the URL...', mimetype)
            extension = os.path.splitext(urlparse.urlparse(fetch_info['url']).path)[1]
        if extension and extension.startswith('.'):
            extension = extension[1:]

        content_disposition = parse_requests_response(response)
        filename = content_disposition.filename_sanitized(extension=extension, default_filename='archive')

        content_length = response.headers.get('content-length')
        if content_length is not None:
            content_length = int(content_length)

        archive_path = os.path.join(target_path, filename)
        progress.on_start(target=os.path.basename(archive_path), size=content_length)

        # Stream the body to disk in 1 KiB chunks, reporting as we go.
        with open(archive_path, 'wb') as target_file:
            written = 0
            for chunk in response.iter_content(1024):
                if chunk:
                    target_file.write(chunk)
                    written += len(chunk)
                    progress.on_update(progress=written)

        progress.on_finish()

        return {
            'archive_path': archive_path,
            'archive_type': archive_type,
        }
Exemplo n.º 12
0
def _find_store_dir(file_path):
    """Pick a storage subdirectory name for *file_path* from its mimetype.

    Falls back to the file's own extension for generic or unknown mimetypes.
    Returns the extension lower-cased with the leading dot stripped (may be
    empty when neither source yields an extension).
    """
    mime = magic.from_file(file_path, mime=True)
    # Hoisted: the original called guess_extension() twice.
    guessed = mimetypes.guess_extension(mime)

    # Generic mimetypes (octet-stream / plain text) tell us nothing useful,
    # so trust the file's own extension in that case.
    if mime in ('application/octet-stream', 'text/plain') or guessed is None:
        store_dir = os.path.splitext(file_path)[1]
    else:
        store_dir = guessed

    return store_dir.lstrip(".").lower()
Exemplo n.º 13
0
	def download(self, resource):
		"""Write a completed resource's payload to "<resource><ext>" in the CWD.

		*resource* may be a global id (a key of Main.completedResources) or a
		local id stored at index 3 of an entry.  Entry layout, as used here:
		index 1 = payload bytes, index 2 = mimetype, index 3 = local id.
		"""
		if resource in Main.completedResources:  # check if they're using the global id
			entry = Main.completedResources[resource]
			# BUG FIX: guess_extension() may return None; fall back to no
			# extension rather than crashing on the concatenation.
			ext = mimetypes.guess_extension(entry[2]) or ''
			with open(str(resource) + ext, "wb") as output:
				output.write(entry[1])
		else:
			for key in Main.completedResources:  # loop through to check if they're using the local id
				completedResource = Main.completedResources[key]
				if completedResource[3] == resource:
					ext = mimetypes.guess_extension(completedResource[2]) or ''
					with open(str(resource) + ext, "wb") as output:
						output.write(completedResource[1])
Exemplo n.º 14
0
    def _oooConvertByFormat(self, printout, content_type, extra_context, REQUEST, format, batch_mode):
        """
    Convert the ODF document into the given format.

    Keyword arguments:
    printout -- ODF document
    content_type -- the content type of the printout
    extra_context -- extra_context including a format
    REQUEST -- Request object
    format -- requested output format
    batch_mode -- Disable headers overriding
    """
        if REQUEST is not None and not format:
            format = REQUEST.get("format", None)
        filename = self.getProperty("filename")
        # Call refresh through cloudooo
        # XXX This is a temporary implementation:
        # Calling a webservice must be done through a WebServiceMethod
        # and a WebServiceConnection
        from Products.ERP5OOo.Document.OOoDocument import OOoServerProxy, enc, dec

        server_proxy = OOoServerProxy(self)
        # NOTE(review): guess_extension() returns None for unknown content
        # types, which would make .strip() raise AttributeError here --
        # confirm content_type is always a known ODF type at this point.
        extension = guess_extension(content_type).strip(".")
        # Round-trip the document through the conversion server with the same
        # source and destination format, i.e. a "refresh" pass.
        printout = dec(
            server_proxy.convertFile(
                enc(printout), extension, extension, False, True  # source_format  # destination_format  # zip
            )
        )  # refresh
        # End of temporary implementation
        if not format:
            # No conversion requested: serve the refreshed printout directly,
            # with inline-download headers unless running in batch mode.
            if REQUEST is not None and not batch_mode:
                REQUEST.RESPONSE.setHeader("Content-Length", len(printout))
                REQUEST.RESPONSE.setHeader("Content-Type", "%s" % content_type)
                REQUEST.RESPONSE.setHeader(
                    "Content-disposition", 'inline;filename="%s%s"' % (filename, guess_extension(content_type) or "")
                )
            return printout
        from Products.ERP5Type.Document import newTempOOoDocument

        # Conversion requested: wrap the printout in a temporary OOo document
        # and let its convert() machinery produce the target format.
        tmp_ooo = newTempOOoDocument(self, self.title_or_id())
        tmp_ooo.edit(
            data=printout,
            base_data=printout,
            filename=self.title_or_id(),
            content_type=content_type,
            base_content_type=content_type,
        )
        mime, data = tmp_ooo.convert(format)
        if REQUEST is not None and not batch_mode:
            REQUEST.RESPONSE.setHeader("Content-Length", len(data))
            REQUEST.RESPONSE.setHeader("Content-type", mime)
            REQUEST.RESPONSE.setHeader("Content-disposition", 'attachment;filename="%s.%s"' % (filename, format))
        return str(data)
Exemplo n.º 15
0
def guess_extension(response):
    """Return an extension (e.g. ".html") for *response*, or None.

    Tries the Content-Type header first, then falls back to guessing from
    the response URL.
    """
    if not response:
        return None
    ext = None
    ct = response.info().get('content-type')
    if ct:
        # Drop any "; charset=..." parameters before the lookup.
        mimetype = ct.split(';')[0]
        ext = mimetypes.guess_extension(mimetype)
    if ext is None:
        # Fall back to the URL.  BUG FIX: previously this fallback only ran
        # when a Content-Type header was present but unmappable; now it also
        # covers responses with no Content-Type header at all.
        mimetype, _ = mimetypes.guess_type(response.geturl())
        if mimetype:
            ext = mimetypes.guess_extension(mimetype)
    return ext
Exemplo n.º 16
0
def receive_pop():
    """Fetch every message from a POP3 mailbox and save each MIME part that
    has (or can be given) a filename into the current directory.
    """
    M = poplib.POP3('pop.sina.com')
    M.user('*****@*****.**')
    M.pass_(getpass.getpass())
    numMessages = len(M.list()[1])

    for i in range(numMessages):
        # BUG FIX: the line buffer was created once *outside* the loop, so
        # every message's body included all previous messages' lines.
        temp = []
        for j in M.retr(i + 1)[1]:
            temp.append(j)

        # NOTE(review): poplib returns bytes lines on Python 3; this join
        # assumes str lines (Python 2) -- confirm the target interpreter.
        body = "\n".join(temp)
        begin = re.search("Content", body)
        print(body[begin.start():])
        msg = email.message_from_string(body[begin.start():])
        counter = 1
        for part in msg.walk():
            # multipart/* are just containers
            if part.get_content_maintype() == 'multipart':
                continue
            # Applications should really sanitize the given filename so that
            # an email message can't be used to overwrite import files
            filename = part.get_filename()
            if not filename:
                ext = mimetypes.guess_extension(part.get_content_type())
                if not ext:
                    # Use a generic bag of bits extension
                    ext = '.bin'
                filename = 'part-%03d%s' % (counter, ext)
            counter += 1
            # Use a context manager so the file is closed even on write errors.
            with open(os.path.join(".", filename), 'wb') as fp:
                fp.write(part.get_payload(decode=True))
Exemplo n.º 17
0
    def __indexCycleProcess(self):
        """Worker loop: pop URLs from the shared queue, fetch each into a
        temp file, move it into the cached-document root, and hand it to the
        indexer.  Runs until self.__run is cleared.  (Python 2 code: uses the
        file() builtin.)
        """
        self.__run = 1
        while self.__run:
            # The URL queue is shared with other threads; guard the pop.
            self.__lockUrls.acquire()
            if len(self.__urls) > 0:
                url, search_query, weight = self.__urls.pop()
            else:
                url = None
            self.__lockUrls.release()

            if url:
                tmp_filename = tempfile.mktemp("", maay.globalvars.config.getValue("TemporaryDocumentRoot") + os.path.sep)
                fd = file(tmp_filename, "wb")

                infos = self.__fetchURL(url, fd)

                if infos:
                    mime_type, last_modified, content_size, document_id = infos
                else:
                    # Fetch failed; the temp file is abandoned and we move on.
                    continue

                # Name the cached copy after the document id, with an
                # extension derived from the mimetype (".txt" fallback).
                newname = document_id + (mimetypes.guess_extension(mime_type) or ".txt")
                absolute_newname = "%s%s%s" % (maay.globalvars.config.getValue("CachedDocumentRoot"), os.path.sep, newname)
                if os.path.exists(absolute_newname):
                    os.remove(absolute_newname)

                maay.globalvars.logger.debug("rename %s => %s" % (tmp_filename, absolute_newname))
                os.rename(tmp_filename, absolute_newname)
                maay.globalvars.logger.debug("done => %s" % absolute_newname)
                maay.globalvars.indexer.addNewDocumentToIndex(absolute_newname, mime_type, last_modified, url, search_query=search_query, weight=weight)
            else:
                # Queue empty: idle briefly before polling again.
                time.sleep(2)
Exemplo n.º 18
0
    def addPicture(self, filename, mediatype=None, content=None):
        """
        Add a picture
        It uses the same convention as OOo, in that it saves the picture in
        the zipfile in the subdirectory 'Pictures'
        If passed a file ptr, mediatype must be set
        @param filename unicode string: name of a file for Pictures
        @param mediatype unicode string: name of a media, None by default
        @param content bytes: content of media, None by default
        @return a unicode string: the file name of the media, eventually
        created on the fly
        """
        if content is None:
            if mediatype is None:
                mediatype, encoding = mimetypes.guess_type(filename)
            if mediatype is None:
                mediatype = u''
                try:
                    ext = filename[filename.rindex(u'.'):]
                except ValueError:
                    # BUG FIX: narrowed from a bare except; rindex only
                    # raises ValueError when no dot is present.
                    ext = u''
            else:
                # BUG FIX: guess_extension() may return None, which would
                # have produced a literal "None" in the manifest file name.
                ext = mimetypes.guess_extension(mediatype) or u''
            manifestfn = u"Pictures/%s%s" % (uuid.uuid4().hex.upper(), ext)
            self.Pictures[manifestfn] = (IS_FILENAME, filename, mediatype)
            content = b""  # this value is only use by the assert further
            filename = u""  # this value is only use by the assert further
        else:
            manifestfn = filename
            self.Pictures[manifestfn] = (IS_IMAGE, content, mediatype)

        assert(type(filename) == type(u""))
        assert(type(content) == type(b""))

        return manifestfn
Exemplo n.º 19
0
 def get_result(self, path, original_name=False,
                default_ext='.bin', delete_msg=True, get_file=True):
     """Read one result message from the output queue and (optionally)
     download the referenced S3 objects into *path*.

     Returns the queue message, or None when the queue is empty.
     (Python 2 code: uses a print statement.)
     """
     q = self.get_queue(self.output_queue_name)
     m = q.read()
     if m:
         if get_file:
             # OutputKey is "key;mimetype=TYPE" entries joined by commas.
             outputs = m['OutputKey'].split(',')
             for output in outputs:
                 # NOTE(review): "type" shadows the builtin here.
                 key_name, type = output.split(';')
                 mime_type = type.split('=')[1]
                 if original_name:
                     # Keep the original base name but replace its extension
                     # with one derived from the reported mimetype.
                     file_name = m.get('OriginalFileName', key_name)
                     file_name, ext = os.path.splitext(file_name)
                     ext = mimetypes.guess_extension(mime_type)
                     if not ext:
                         ext = default_ext
                     file_name = file_name + ext
                 else:
                     file_name = key_name
                 bucket = self.get_bucket(m['Bucket'])
                 key = bucket.lookup(key_name)
                 print 'retrieving file: %s' % file_name
                 key.get_contents_to_filename(os.path.join(path, file_name))
         if delete_msg:
             q.delete_message(m)
     return m
Exemplo n.º 20
0
def upload_image_url(request):
    """Django view: download the image at POSTed ``image_url`` (sending
    ``source_domain`` as referer), store it as a Post_photo, and return its
    URL as JSON.  (Python 2 code: uses urllib2.)
    """
    if request.method != 'POST':
        return HttpResponse(status=403)
    image_url = request.POST.get('image_url', None)
    source_domain = request.POST.get('source_domain', None)

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'referer': source_domain,
    }

    # BUG FIX: guess_type() returns (None, None) for unrecognized URLs, and
    # guess_extension(None) raises AttributeError; fall back to no extension.
    mime = mimetypes.guess_type(image_url)[0]
    ext = (mimetypes.guess_extension(mime) if mime else None) or ''
    req = urllib2.Request(image_url, headers=headers)
    img_temp = NamedTemporaryFile(delete=True)
    img_temp.write(urllib2.urlopen(req).read())
    img_temp.flush()

    post_photo = Post_photo()
    post_photo.photo.save('%s%s' % (uuid.uuid4(), ext), File(img_temp))
    post_photo.save()

    res = {
        'link': post_photo.photo.url,
    }
    return JsonResponse(res, safe=False)
Exemplo n.º 21
0
def get_extension(content):
    """A handful of workarounds for getting extensions we can trust.

    *content* is the raw file body; the mimetype is sniffed with libmagic and
    a few known-bad guesses are corrected before mapping to an extension.
    """
    file_str = magic.from_buffer(content)
    if file_str.startswith('Composite Document File V2 Document'):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = 'application/msword'
    elif file_str == '(Corel/WP)':
        mime = 'application/vnd.wordperfect'
    elif file_str == 'C source, ASCII text':
        mime = 'text/plain'
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)
    extension = mimetypes.guess_extension(mime)
    if extension == '.obj':
        # It could be a wpd, if it's not a PDF
        # BUG FIX: use a bytes literal -- 'PDF' in a bytes body raises
        # TypeError on Python 3 (b'PDF' also matches str on Python 2).
        if b'PDF' in content[0:40]:
            # Does 'PDF' appear in the beginning of the content?
            extension = '.pdf'
        else:
            extension = '.wpd'
    # Remap extensions that mimetypes guesses poorly for our corpus:
    # .wsdl is probably HTML (e.g. Resource.org pages); .ksh is plain text;
    # .asf should be treated as .wma.
    remap = {'.wsdl': '.html', '.ksh': '.txt', '.asf': '.wma'}
    return remap.get(extension, extension)
Exemplo n.º 22
0
def func_image_svg_plus_xml(request, cf):
	"""Render an SVG to PNG when the request's force_mimetype parameter asks
	for it; otherwise (or on any failure) return *cf* unchanged.
	"""
	__argument = request.GET.copy()

	try:
		convert = mimetypes.guess_extension(__argument.get("force_mimetype", "").strip()).split(".")[1]
	# BUG FIX: narrowed from a bare except (which also swallowed
	# KeyboardInterrupt/SystemExit).  guess_extension() returns None for
	# unknown types, making the chained .split() raise AttributeError; any
	# failure simply means "no conversion".
	except Exception:
		convert = None

	if convert and convert in ("png", ):
		try:
			s = svg.SVG(cf)
			output = s.render(
				outputtype=convert,
				width=__argument.get("width"),
				height=__argument.get("height"),
			)

			tmp = func_image(
				request,
				output
			)
			return tmp
		except Exception:
			# Best-effort conversion: fall back to the original file.
			return cf

	return cf
Exemplo n.º 23
0
def addNewVideo(request):
	"""Django view (Python 2 syntax): accept an uploaded video file, store it
	via the project's upload helper, create a Videos row, and return a JSON-ish
	dict with a rendered snippet.

	NOTE(review): the "f**k" names appear to be a censored project package
	name from the scrape; `extns` is computed but never used.
	"""
	import mimetypes
	data = {}
	try:
		if request.FILES:
			_file = request.FILES['video']
			visible= request.POST.get('visible')
			_name = _file.name
			# Derive the mimetype/encoding from the uploaded name; the
			# extension lookup below is unused.
			_type,enc = mimetypes.guess_type(_name) 
			extns =mimetypes.guess_extension(_type)

			_dir = 'videos/'
			uploaded =f**k.util.UploadVideo(request, path='f**k/media/'+_dir)	
			if uploaded is not None:				
				data['token'] = uploaded.out_file_name[0]
				# Probe the stored file for thumbnail/size/duration metadata.
				info = myutil.video_info(data['token'])			
				thumb = info.get('thumb')
				size = info.get('size')				
				dur = info.get('dur')
				success = m.Videos.objects.create( user= request.user,title=_name, token = data['token'] , type =_type, encoding= enc, visibility =visible, thumbnail= thumb, duration = dur  ,resolution= size)
				if success is not None:
					data['success']  = True
					data['message'] = 'uploaded successfully '
					data['html'] = get_template('ajax/video/render_video.html').render(Context({'video':success}))
			else:
				raise Exception('Something went wrong can not upload this file !')
		else:
			data['error'] =True
			data['message'] = "File Not found!"
	except Exception,ex:
		traceback.print_exc(file=sys.stdout)
		data['error'] = True
		data['message'] = 'Internal Error'
		return HttpResponse( "Exception %s"%ex.message)
Exemplo n.º 24
0
Arquivo: forms.py Projeto: theju/smp
    def clean(self):
        """Form-level validation: localize scheduled_datetime, enforce that
        only one media source is given, and download media_url into an
        attached image file.

        Raises forms.ValidationError on conflicting inputs, download failure,
        or an invalid image.
        """
        data = self.cleaned_data

        if data.get("scheduled_datetime"):
            # Re-interpret the naive datetime in the submitted timezone, then
            # convert it to the active local timezone.
            sched_dt = data["scheduled_datetime"]
            sched_tz = timezone.pytz.timezone(data.get("scheduled_tz"))
            sched_dt = sched_tz.localize(sched_dt.replace(tzinfo=None))
            data["scheduled_datetime"] = timezone.localtime(sched_dt)

        if data.get("attached_media") and data.get("media_url"):
            raise forms.ValidationError(_("Only one of media URL or "
                                          "attached media may be provided"))

        if data.get("media_url"):
            response = requests.get(data["media_url"])
            if not response.ok:
                raise forms.ValidationError(_("An error occurred while "
                                              "downloading the media from the URL"))
            # BUG FIX: guess_extension() may return None, and
            # NamedTemporaryFile(suffix=None) raises TypeError.
            ext = mimetypes.guess_extension(response.headers['content-type']) or ''
            ff = tempfile.NamedTemporaryFile(suffix=ext)
            ff.write(response.content)
            img_file = ImageFile(ff, name=ff.name)
            height, width = get_image_dimensions(img_file)
            if height is None or width is None:
                ff.close()
                raise forms.ValidationError(_("Invalid image"))
            data["attached_media"] = img_file
        return data
Exemplo n.º 25
0
    def __init__(self, transmogrifier, name, options, previous):
        """Configure this transmogrifier blueprint section: item keys, cache
        directory, and a urllib2 opener with optional error/redirect handlers.
        """
        self.previous = previous

        self.logger = logging.getLogger(options.get("name", transmogrifier.configuration_id + "." + name))

        # Which item keys carry the URL, cache flag, and response headers.
        self.key = defaultMatcher(options, "url-key", name, "url")
        self.cachekey = Expression(options.get("cache-key", "string:_cache"), transmogrifier, name, options)
        self.headerskey = Expression(options.get("headers-key", "string:_headers"), transmogrifier, name, options)
        # Default headers-file extension comes from the message/rfc822 type.
        self.headersext = options.get("headers-extension", mimetypes.guess_extension("message/rfc822"))

        self.cachedir = resolvePackageReferenceOrFile(
            options.get("cache-directory", os.path.join(os.environ.get("PWD", os.getcwd()), "var/urlopener.cache.d"))
        )
        if not os.path.isdir(self.cachedir):
            os.makedirs(self.cachedir)
        self.defaultpagename = options.get("default-page-name", ".{}.cache".format(options["blueprint"]))

        # Assemble the urllib2 opener: user-supplied handlers, an optional
        # error-swallowing handler, and a redirect handler if none was given.
        handlers = Expression(options.get("handlers", "python:[]"), transmogrifier, name, options)(options)
        if "ignore-error" in options:
            self.ignore_error = Expression(options["ignore-error"], transmogrifier, name, options)
            self.ignore_handler = HTTPDefaultErrorHandler()
            self.ignore_handler.section = self
            handlers.append(self.ignore_handler)
        if not [handler for handler in handlers if isinstance(handler, urllib2.HTTPRedirectHandler)]:
            handlers.append(HTTPRedirectHandler())
        self.opener = urllib2.build_opener(*handlers)
Exemplo n.º 26
0
def check_content_id(msg):
    """Check message part for Content-Id key.

    The use of Content-ID in mail messages seems to mostly be related
    to inline images.

    Since this key is case insensitive, have to loop through all
    kinds of keys to get there :(

    The content-id uri seems...pretty loose.
    <https://tools.ietf.org/html/rfc2392>
    """
    for k in msg.keys():
        if k.lower() != 'content-id':
            continue

        content_id = msg[k]

        if content_id.startswith('<'):
            content_id = content_id[1:]

        if content_id.endswith('>'):
            content_id = content_id[:-1]

        ext = mimetypes.guess_extension(msg.get_content_type())
        fn = os.path.join(TMPDIR, '%s%s' % (str(uuid.uuid4()), ext))

        CONTENTS.append({'id': content_id, 'filename': fn})

        with open(fn, 'wb') as f:
            f.write(msg.get_payload(decode=True))
Exemplo n.º 27
0
 def _save(self, name, content):
     """Upload *content* to the bucket under a content-hash-derived key.

     Returns the new key name. The original filename is preserved as the
     key's 'original_filename' metadata; compressible content types are
     gzipped first when self.gzip is enabled.
     """
     name = self._clean_name(name)
     # Work on a copy: mutating self.headers in place would leak per-call
     # headers (e.g. 'Content-Encoding: gzip') into every subsequent save.
     headers = self.headers.copy()
     content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"

     if self.gzip and content_type in self.gzip_content_types:
         content = self._compress_content(content)
         headers.update({'Content-Encoding': 'gzip'})

     headers.update({
         'Content-Type': content_type,
         'Content-Length': len(content),
     })

     # Content-addressed naming: identical payloads map to the same key.
     newname = _compute_hash(content)
     extension = mimetypes.guess_extension(content_type)
     if extension:
         newname = newname + extension
     content.name = newname
     k = self.bucket.get_key(newname)
     if not k:
         k = self.bucket.new_key(newname)
         k.set_metadata('original_filename', name)
     k.set_contents_from_file(content, headers=headers, policy=self.acl)
     return newname
Exemplo n.º 28
0
    def download(self, url):
        """Stream *url* into self.f.

        Returns False when the server answers 404, True otherwise.
        Side effects: sets self.filename (sanitized to letters and dots)
        and, when the response carries one, self.content_type.
        """
        r = requests.get(url, stream=True)
        # Bail out before writing: the original streamed the 404 error body
        # into self.f and only then reported failure.
        if r.status_code == 404:
            return False

        for chunk in r.iter_content(chunk_size=1024):
            self.f.write(chunk)
            self.f.flush()

        # Default filename: last path segment of the URL.
        parsed_url = urlparse(url)
        self.filename = list(reversed(parsed_url.path.split("/")))[0]
        if "content-type" in r.headers:
            self.content_type = r.headers['content-type']
            ext = mimetypes.guess_extension(self.content_type)
            if ext:
                self.filename = self.filename + ext
        # A Content-Disposition filename, when present, wins.
        if "content-disposition" in r.headers:
            disposition = r.headers['content-disposition']
            parts = disposition.split(';')
            if len(parts) > 1:
                self.filename = parts[1].strip(' ')
                self.filename = self.filename[self.filename.find('=') + 1:].strip(' ')
        # NOTE(review): this also drops digits from the filename -- presumably
        # intentional sanitization; confirm against callers.
        self.filename = ''.join([c for c in self.filename if c.isalpha() or c == '.'])
        # Parenthesized so the statement works on both Python 2 and 3.
        print(self.filename)

        return True
Exemplo n.º 29
0
    def addPictureFromFile(self, filename, mediatype=None):
        """
        Add a picture
        It uses the same convention as OOo, in that it saves the picture in
        the zipfile in the subdirectory 'Pictures'.
        If mediatype is not given, it will be guessed from the filename
        extension.
        @param filename unicode string: name of an image file
        @param mediatype unicode string: type of media, defaults to None
        @return a unicode string, the name of the created file
        """
        if mediatype is None:
            mediatype, encoding = mimetypes.guess_type(filename)
        if mediatype is None:
            # Unknown type: keep whatever extension the filename carries.
            mediatype = u''
            try: ext = filename[filename.rindex(u'.'):]
            except ValueError: ext=u''
        else:
            # guess_extension() returns None for unrecognised MIME types;
            # fall back to an empty extension instead of embedding "None"
            # in the generated Pictures/ name.
            ext = mimetypes.guess_extension(mediatype) or u''
        manifestfn = u"Pictures/%s%s" % (uuid.uuid4().hex.upper(), ext)
        self.Pictures[manifestfn] = (IS_FILENAME, filename, mediatype)

        assert(type(filename)==type(u""))
        assert(type(mediatype)==type(u""))

        return manifestfn
Exemplo n.º 30
0
def get_extension(content):
    """A handful of workarounds for getting extensions we can trust."""
    description = magic.from_buffer(content)
    if description.startswith("Composite Document File V2 Document"):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = "application/msword"
    elif description == "(Corel/WP)":
        mime = "application/vnd.wordperfect"
    elif description == "C source, ASCII text":
        mime = "text/plain"
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)

    extension = mimetypes.guess_extension(mime)
    if extension == ".obj":
        # Ambiguous guess: disambiguate by inspecting the content itself.
        if "PDF" in content[0:40]:
            extension = ".pdf"
        elif is_html(content):
            extension = ".html"
        else:
            extension = ".wpd"

    # Remaining known-bad guesses map straight to trusted replacements:
    # .wsdl is usually HTML (e.g. Resource.org), .ksh is plain text,
    # .asf is Windows Media audio.
    trusted = {".wsdl": ".html", ".ksh": ".txt", ".asf": ".wma"}
    return trusted.get(extension, extension)
Exemplo n.º 31
0
def ticket_from_message(message, queue, quiet):
    """Create or update a helpdesk Ticket from an RFC822 message string.

    Returns the Ticket on success. For senders matching an IgnoreEmail
    rule, returns False (keep message in mailbox) or True (delete it).
    Side effects: saves Ticket/FollowUp/Attachment rows and sends the
    queue's notification emails. NOTE: this function uses Python 2
    `print` statements and `unicode()` -- Python 2 only.
    """
    # 'message' must be an RFC822 formatted message.
    msg = message
    message = email.message_from_string(msg)
    subject = message.get('subject', _('Created from e-mail'))
    subject = decode_mail_headers(decodeUnknown(message.get_charset(), subject))
    # Strip common reply/forward prefixes so ticket-id matching and titles
    # are stable across mail clients.
    subject = subject.replace("Re: ", "").replace("Fw: ", "").replace("RE: ", "").replace("FW: ", "").replace("Automatic reply: ", "").strip()

    sender = message.get('from', _('Unknown Sender'))
    sender = decode_mail_headers(decodeUnknown(message.get_charset(), sender))

    sender_email = parseaddr(sender)[1]

    body_plain, body_html = '', ''

    # Ignore rules may be queue-specific or global (queues is null).
    for ignore in IgnoreEmail.objects.filter(Q(queues=queue) | Q(queues__isnull=True)):
        if ignore.test(sender_email):
            if ignore.keep_in_mailbox:
                # By returning 'False' the message will be kept in the mailbox,
                # and the 'True' will cause the message to be deleted.
                return False
            return True

    # A "[slug-123]" tag in the subject marks a reply to an existing ticket.
    matchobj = re.match(r".*\["+queue.slug+"-(?P<id>\d+)\]", subject)
    if matchobj:
        # This is a reply or forward.
        ticket = matchobj.group('id')
    else:
        ticket = None

    counter = 0
    files = []

    # Walk the MIME tree: unnamed text parts become the ticket body,
    # everything else is collected as an attachment.
    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        name = part.get_param("name")
        if name:
            name = collapse_rfc2231_value(name)

        if part.get_content_maintype() == 'text' and name == None:
            if part.get_content_subtype() == 'plain':
                body_plain = EmailReplyParser.parse_reply(decodeUnknown(part.get_content_charset(), part.get_payload(decode=True)))
            else:
                body_html = part.get_payload(decode=True)
                try:
                    # strip html tags
                    body_plain = striptags(body_html)
                except DjangoUnicodeDecodeError as e:
                    # Charset in headers was wrong; let chardet guess it.
                    charset = chardet.detect(body_html)['encoding']
                    body_plain = striptags(unicode(body_html, charset))
                # remove extra new lines
                body_plain, n = re.subn(r'[\r\n]+', r'\n', body_plain)
                # remove extra spaces
                body_plain, n = re.subn(r'\s+$', '', body_plain, flags=re.M)
                body_plain = unescape(body_plain)
        else:
            if not name:
                # Unnamed attachment: synthesize a name from the part index.
                # NOTE(review): guess_extension() may return None here, which
                # would embed "None" in the filename -- confirm intended.
                ext = mimetypes.guess_extension(part.get_content_type())
                name = "part-%i%s" % (counter, ext)

            files.append({
                'filename': name,
                'content': part.get_payload(decode=True),
                'type': part.get_content_type()},
                )

        counter += 1

    if body_plain:
        body = body_plain
        if body_html:
            body += '\n\n'
            body += _('***Note that HTML tags are stripped out. Please see attachment email_html_body.html for the full html content.')
    else:
        body = _('No plain-text email body available. Please see attachment email_html_body.html.')

    if body_html:
        # Preserve the original HTML as an attachment since the body above
        # holds only the stripped text.
        files.append({
            'filename': _("email_html_body.html"),
            'content': body_html,
            'type': 'text/html',
        })

    now = timezone.now()

    # Verify the referenced ticket actually exists; otherwise fall through
    # to creating a new one below.
    if ticket:
        try:
            t = Ticket.objects.get(id=ticket)
            new = False
        except Ticket.DoesNotExist:
            ticket = None

    priority = 3

    smtp_priority = message.get('priority', '')
    smtp_importance = message.get('importance', '')

    high_priority_types = ('high', 'important', '1', 'urgent')

    if smtp_priority in high_priority_types or smtp_importance in high_priority_types:
        priority = 2

    if ticket == None:
        t = Ticket(
            title=subject,
            queue=queue,
            submitter_email=sender_email,
            created=now,
            description=body,
            priority=priority,
        )
        t.save()
        new = True
        update = ''

    elif t.status == Ticket.CLOSED_STATUS:
        # A reply to a closed ticket reopens it.
        t.status = Ticket.REOPENED_STATUS
        t.save()

    f = FollowUp(
        ticket = t,
        title = _('E-Mail Received from %(sender_email)s' % {'sender_email': sender_email}),
        date = timezone.now(),
        public = True,
        comment = body,
    )

    if t.status == Ticket.REOPENED_STATUS:
        f.new_status = Ticket.REOPENED_STATUS
        f.title = _('Ticket Re-Opened by E-Mail Received from %(sender_email)s' % {'sender_email': sender_email})
    
    f.save()

    if not quiet:
        print (" [%s-%s] %s" % (t.queue.slug, t.id, t.title,)).encode('ascii', 'replace')

    # Persist attachments; filenames are reduced to a safe ASCII subset.
    for file in files:
        if file['content']:
            filename = file['filename'].encode('ascii', 'replace').replace(' ', '_')
            filename = re.sub('[^a-zA-Z0-9._-]+', '', filename)
            a = Attachment(
                followup=f,
                filename=filename,
                mime_type=file['type'],
                size=len(file['content']),
                )
            a.file.save(filename, ContentFile(file['content']), save=False)
            a.save()
            if not quiet:
                print "    - %s" % filename


    context = safe_template_context(t)

    # Notify: new tickets mail the submitter and the queue CC lists;
    # updates mail the assignee and the updated-ticket CC list.
    if new:

        if sender_email:
            send_templated_mail(
                'newticket_submitter',
                context,
                recipients=sender_email,
                sender=queue.from_address,
                fail_silently=True,
                )

        if queue.new_ticket_cc:
            send_templated_mail(
                'newticket_cc',
                context,
                recipients=queue.new_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
                )

        if queue.updated_ticket_cc and queue.updated_ticket_cc != queue.new_ticket_cc:
            send_templated_mail(
                'newticket_cc',
                context,
                recipients=queue.updated_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
                )

    else:
        context.update(comment=f.comment)

        # NOTE(review): 'update' is assigned here but not used afterwards in
        # this function -- possibly consumed by templates via context; verify.
        if t.status == Ticket.REOPENED_STATUS:
            update = _(' (Reopened)')
        else:
            update = _(' (Updated)')

        if t.assigned_to:
            send_templated_mail(
                'updated_owner',
                context,
                recipients=t.assigned_to.email,
                sender=queue.from_address,
                fail_silently=True,
                )

        if queue.updated_ticket_cc:
            send_templated_mail(
                'updated_cc',
                context,
                recipients=queue.updated_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
                )

    return t
Exemplo n.º 32
0
def get_mime(file):
    """Given a file, returns mimetype and extension"""
    # Sniff only the first 2 KiB -- enough for libmagic's detection.
    detected = magic.from_buffer(file.read(2048), mime=True)
    return detected, guess_extension(detected, False)
Exemplo n.º 33
0
def getExtension(mimeType):
    """Return the file extension for *mimeType*.

    An entry in the module-level contentTypes table takes precedence;
    otherwise fall back to mimetypes.guess_extension (which may return
    None for unknown types).
    """
    # 'global' is only needed for assignment; reading a module-level name
    # works without it, so the original declaration was dropped.
    return contentTypes.get(mimeType, mimetypes.guess_extension(mimeType))
Exemplo n.º 34
0
def test_attach_http(mock_get):
    """
    API: AttachHTTP() object

    Exercises AttachHTTP URL parsing, lazy download via a mocked
    requests.get, filename/mimetype detection, size limits, and
    request-exception handling.
    """

    # Define our good:// url
    class GoodNotification(NotifyBase):
        def __init__(self, *args, **kwargs):
            super(GoodNotification, self).__init__(*args, **kwargs)

        def notify(self, *args, **kwargs):
            # Pretend everything is okay
            return True

        def url(self):
            # Support url() function
            return ''

    # Store our good notification in our schema map
    SCHEMA_MAP['good'] = GoodNotification

    # Temporary path
    path = join(TEST_VAR_DIR, 'apprise-test.gif')

    class DummyResponse(object):
        """
        A dummy response used to manage our object
        """
        status_code = requests.codes.ok
        headers = {
            'Content-Length': getsize(path),
            'Content-Type': 'image/gif',
        }

        # Pointer to file
        ptr = None

        # used to return random keep-alive chunks
        _keepalive_chunk_ref = 0

        def close(self):
            return

        def iter_content(self, chunk_size=1024):
            """Lazy function (generator) to read a file piece by piece.
            Default chunk size: 1k."""

            while True:
                self._keepalive_chunk_ref += 1
                # True when the counter divides 16 evenly (refs 1, 2, 4, 8, 16);
                # exercises the client's handling of empty keep-alive chunks.
                if 16 % self._keepalive_chunk_ref == 0:
                    # Yield a keep-alive block
                    yield ''

                data = self.ptr.read(chunk_size)
                if not data:
                    break
                yield data

        def raise_for_status(self):
            return

        def __enter__(self):
            self.ptr = open(path, 'rb')
            return self

        def __exit__(self, *args, **kwargs):
            self.ptr.close()

    # Prepare Mock
    dummy_response = DummyResponse()
    mock_get.return_value = dummy_response

    # Test custom url get parameters
    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/apprise.gif?dl=1&cache=300')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True

    # Test that our extended variables are passed along
    # (evaluating the attachment in boolean context triggers the download)
    assert mock_get.call_count == 0
    assert attachment
    assert mock_get.call_count == 1
    assert 'params' in mock_get.call_args_list[0][1]
    assert 'dl' in mock_get.call_args_list[0][1]['params']

    # Verify that arguments that are reserved for apprise are not
    # passed along
    assert 'cache' not in mock_get.call_args_list[0][1]['params']

    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/apprise.gif?+key=value&cache=True')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    # No Content-Disposition; so we use filename from path
    assert attachment.name == 'apprise.gif'
    assert attachment.mimetype == 'image/gif'

    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/ignore-filename.gif')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'myimage.gif'
    assert attachment
    assert len(attachment) == getsize(path)

    # Similar to test above except we make our max message size just 1 byte
    # smaller then our gif file. This will cause us to fail to read the
    # attachment
    AttachHTTP.max_file_size = getsize(path) - 1
    results = AttachHTTP.parse_url('http://localhost/toobig.jpg')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    # we can not download this attachment
    assert not attachment
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype is None
    assert attachment.name is None
    assert len(attachment) == 0

    # Disable our file size limitations
    AttachHTTP.max_file_size = 0
    results = AttachHTTP.parse_url('http://user@localhost')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'myimage.gif'
    assert attachment
    assert len(attachment) == getsize(path)

    # Set our header up with an invalid Content-Length; we can still process
    # this data. It just means we track it lower when reading back content
    dummy_response.headers = {
        'Content-Length': 'invalid'
    }
    results = AttachHTTP.parse_url('http://localhost/invalid-length.gif')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'invalid-length.gif'
    assert attachment

    # Give ourselves nothing to work with
    dummy_response.headers = {}
    results = AttachHTTP.parse_url('http://user@localhost')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    # we can not download this attachment
    assert attachment
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None

    # Handle edge-case where detected_name is None for whatever reason
    attachment.detected_name = None
    assert attachment.mimetype == attachment.unknown_mimetype
    assert attachment.name.startswith(AttachHTTP.unknown_filename)
    assert len(attachment) == getsize(path)

    # Exception handling
    mock_get.return_value = None
    for _exception in REQUEST_EXCEPTIONS:
        aa = AppriseAttachment.instantiate(
            'http://localhost/exception.gif?cache=30')
        assert isinstance(aa, AttachHTTP)

        mock_get.side_effect = _exception
        assert not aa

    # Restore value
    # NOTE(review): max_file_size here is a module-level capture of the class
    # default (defined outside this view) -- restores AttachHTTP's original.
    AttachHTTP.max_file_size = max_file_size
Exemplo n.º 35
0
def get_extension(image_type):
    """Return the file extension for the MIME type *image_type*.

    Returns None when the type is unknown or the input is not a string
    (e.g. None), mirroring mimetypes.guess_extension's unknown-type result.
    """
    try:
        return mimetypes.guess_extension(image_type)
    # Narrowed from a bare 'except:': guess_extension raises AttributeError
    # (or TypeError) only for non-string input; anything else should surface.
    except (TypeError, AttributeError):
        return None
Exemplo n.º 36
0
def guess_extension(mime: str) -> "str | None":
    """Map a MIME type to a file extension.

    A project-level sanity_overrides entry wins; otherwise defer to
    mimetypes.guess_extension, which returns None for unknown types
    (hence the corrected Optional return annotation).
    """
    try:
        return sanity_overrides[mime]
    except KeyError:
        return mimetypes.guess_extension(mime)
Exemplo n.º 37
0
	# print(dir(email_message))
	# Pull the headers we care about; 'date' lookup is case-insensitive.
	to_=email_message["To"]
	from_=email_message["From"]
	subject_=email_message["Subject"]
	date_=email_message["date"]
	#payload is the message in a list
	#iterate the list to retrieve each message
	counter=0
	# Walk the MIME tree, skipping multipart containers; synthesize a
	# filename for parts that do not carry one.
	for part in email_message.walk():
		if part.get_content_maintype() == "multipart":
			continue
		filename= part.get_filename()
		content_type= part.get_content_type()
		if not filename:
			# ext='.html'
			ext= mimetypes.guess_extension(content_type)
			if not ext:
				ext=".bin"
			# if 'text' in content_type:
			# 	ext=".txt"
			# elif "html" in content_type:
			# 	ext=".html"
			filename='msg-part-%08d%s' %(counter, ext)
		counter+=1
	# save file
	# NOTE(review): this save block sits OUTSIDE the loop above, so only the
	# last walked part ('part'/'filename' leak out of the for) is written --
	# looks unintended; confirm whether it should be indented into the loop.
	# Directory layout: ./emails/<date>/<subject>/ -- raw header values are
	# used as path components, so odd characters in them will break makedirs.
	save_path=os.path.join(os.getcwd(), "emails", date_, subject_)
	if not os.path.exists(save_path): #if the path doesnt exists
		os.makedirs(save_path)
	with open(os.path.join(save_path,filename),"wb") as fp:
		fp.write(part.get_payload(decode=True))
	# print(subject_)
Exemplo n.º 38
0
def guess_extension(mimetype):
    """Resolve *mimetype* to a file extension.

    An OVERRIDE_MIMETYPES entry takes precedence over the standard
    mimetypes table; may return None for unknown types.
    """
    fallback = mimetypes.guess_extension(mimetype)
    return OVERRIDE_MIMETYPES.get(mimetype, fallback)
Exemplo n.º 39
0
def save_image(counter,
               url,
               response,
               datasetpath,
               name,
               image_id,
               face_id,
               bbox,
               save_face=False):
    """Save image

    Full images saved to datasetpath/images/name_image_id.ext
    Face images saved to datasetpath/faces/name_image_id_face_id.ext

    Returns True if successful else False

    """

    logger = logging.getLogger("logger")

    # Output dir for images is datasetpath/images/name
    output_dir = os.path.join(datasetpath, "images", name)
    ensure_dir_exists(output_dir)

    # Filename without extension
    filename = "{name}_{image_id}".format(name=name, image_id=image_id)
    outpath = os.path.join(output_dir, filename)

    # Save file without file extension
    with open(outpath, 'wb') as outfile:
        outfile.write(response.content)

    filetype = imghdr.what(outpath)

    # Cannot determine filetype.
    if filetype is None and not has_magic_lib:
        os.remove(outpath)
        logger.error("Line {number}: Cannot determine file type: {url}".format(
            number=counter, url=url))
        return False

    # Get filetype using lib magic
    elif filetype is None and has_magic_lib:
        mimetype = magic.from_buffer(response.content, mime=True)
        if mimetype is None:
            logger.error(
                "Line {number}: Cannot determine file type: {url}".format(
                    number=counter, url=url))
            return False

        # guess_extension() returns None for unknown MIME types; check BEFORE
        # stripping the dot (the original called .lstrip('.') first, which
        # raised AttributeError and made the None-check unreachable).
        ext = mimetypes.guess_extension(mimetype)
        if ext is None:
            logger.error(
                "Line {number}: Cannot determine file type: {url}".format(
                    number=counter, url=url))
            return False
        # Adopt the guessed extension as the filetype (the original only
        # assigned it for "jpe", leaving filetype as None otherwise and
        # producing files named "*.None").
        filetype = ext.lstrip('.')
        if filetype == "jpe":
            filetype = "jpeg"

    # Rename file to have extension
    newpath = "{}.{}".format(outpath, filetype)
    shutil.move(outpath, newpath)

    # If user wants face images
    if save_face:
        try:
            I = Image.open(newpath)
            output_dir = os.path.join(datasetpath, "faces", name)
            ensure_dir_exists(output_dir)
            filename = "{name}_{image_id}_{face_id}.{ext}".format(
                name=name, image_id=image_id, face_id=face_id, ext=filetype)
            I.crop(bbox).save(os.path.join(output_dir, filename))
        except IOError as e:
            logger.error("Line {number}: {error}: {url}".format(number=counter,
                                                                error=e,
                                                                url=url))
            return False

    return True
Exemplo n.º 40
0
def get_info_from_file_reference(file_reference, **kwargs):
    """Resolve a file reference to metadata about a local copy of the file.

    ``file_reference`` may be:
      * a purely numeric string -- looked up as a stored file number;
      * an http(s) URL -- downloaded into a temporary file;
      * a (possibly package-qualified) static path such as
        ``docassemble.base:data/static/logo.png``.

    Returns a dict which may contain ``fullpath``, ``path``, ``filename``,
    ``extension``, ``mimetype`` and ``package`` keys.  Returns an empty
    dict when a requested conversion target file does not exist on disk.

    Keyword arguments recognized: ``convert`` (dict mapping extensions to
    preferred converted extensions), ``privileged``, ``uids``,
    ``filename``, ``question``, ``package``, ``folder``.
    """
    #sys.stderr.write('file reference is ' + str(file_reference) + "\n")
    #logmessage('file reference is ' + str(file_reference))
    if 'convert' in kwargs:
        convert = kwargs['convert']
    else:
        convert = None
    if 'privileged' in kwargs:
        privileged = kwargs['privileged']
    else:
        privileged = None
    has_info = False
    # Case 1: numeric reference -> stored file number.
    if re.search(r'^[0-9]+$', str(file_reference)):
        if 'uids' in kwargs:
            uids = kwargs['uids']
        else:
            uids = None
        # Fall back to the current session uid when none was supplied.
        if uids is None or len(uids) == 0:
            new_uid = docassemble.base.functions.get_uid()
            if new_uid is not None:
                uids = [new_uid]
            else:
                uids = []
        if 'filename' in kwargs:
            result = get_info_from_file_number(int(file_reference),
                                               privileged=privileged,
                                               filename=kwargs['filename'],
                                               uids=uids)
        else:
            result = get_info_from_file_number(int(file_reference),
                                               privileged=privileged,
                                               uids=uids)
        if 'fullpath' not in result:
            result['fullpath'] = None
        has_info = True
    # Case 2: URL -> download to a temporary file.
    elif re.search(r'^https?://', str(file_reference)):
        #logmessage("get_info_from_file_reference: " + str(file_reference) + " is a URL")
        possible_filename = re.sub(r'.*/', '', file_reference)
        if possible_filename == '':
            possible_filename = 'index.html'
        if re.search(r'\.', possible_filename):
            (possible_ext,
             possible_mimetype) = get_ext_and_mimetype(possible_filename)
            possible_ext = re.sub(r'[^A-Za-z0-9\.].*', '', possible_ext)
            #logmessage("get_info_from_file_reference: starting with " + str(possible_ext) + " and " + str(possible_mimetype))
        else:
            # No extension in the URL; assume plain text until the response
            # headers say otherwise.
            possible_ext = 'txt'
            possible_mimetype = 'text/plain'
        result = dict()
        # delete=False: the temp file must outlive this function so the
        # caller can use result['fullpath'].
        temp_file = tempfile.NamedTemporaryFile(prefix="datemp",
                                                suffix='.' + possible_ext,
                                                delete=False)
        req = Request(file_reference,
                      headers={
                          'User-Agent':
                          docassemble.base.config.daconfig.get(
                              'user agent', 'curl/7.64.0')
                      })
        response = urlopen(req)
        temp_file.write(response.read())
        #(local_filename, headers) = urllib.urlretrieve(file_reference)
        result['fullpath'] = temp_file.name
        try:
            #result['mimetype'] = headers.gettype()
            result['mimetype'] = response.headers['Content-Type']
            #logmessage("get_info_from_file_reference: mimetype is " + str(result['mimetype']))
        except Exception as errmess:
            # Fall back to the guess derived from the URL's filename.
            logmessage(
                "get_info_from_file_reference: could not get mimetype from headers"
            )
            result['mimetype'] = possible_mimetype
            result['extension'] = possible_ext
        if 'extension' not in result:
            #logmessage("get_info_from_file_reference: extension not in result")
            # NOTE(review): mimetypes.guess_extension() returns None for an
            # unrecognized Content-Type, which would make re.sub() raise a
            # TypeError here -- confirm upstream always sends a known type.
            result['extension'] = re.sub(
                r'^\.', '', mimetypes.guess_extension(result['mimetype']))
            #logmessage("get_info_from_file_reference: extension is " + str(result['extension']))
        if re.search(r'\.', possible_filename):
            result['filename'] = possible_filename
        else:
            result['filename'] = possible_filename + '.' + result['extension']
        path_parts = os.path.splitext(result['fullpath'])
        result['path'] = path_parts[0]
        has_info = True
        #logmessage("get_info_from_file_reference: downloaded to " + str(result['fullpath']))
    # Case 3: package-relative static file reference.
    else:
        #logmessage(str(file_reference) + " is not a URL")
        result = dict()
        question = kwargs.get('question', None)
        manual_package = kwargs.get('package', None)
        folder = kwargs.get('folder', None)
        the_package = None
        parts = file_reference.split(':')
        if len(parts) == 1:
            # No explicit package in the reference: infer one from the
            # question's source, the manual override, or the current
            # package, in that order.
            the_package = None
            if question is not None:
                the_package = question.from_source.package
            elif manual_package is not None:
                the_package = manual_package
            if the_package is None:
                the_package = docassemble.base.functions.get_current_package()
            if folder is None:
                m = re.search(r'^data/(templates|sources|static)/(.*)',
                              file_reference)
                if m:
                    folder = m.group(1)
                    file_reference = m.group(2)
            if folder is not None and not re.search(r'/', file_reference):
                file_reference = 'data/' + str(folder) + '/' + file_reference
            if the_package is not None:
                #logmessage("package is " + str(the_package))
                file_reference = the_package + ':' + file_reference
            else:
                #logmessage("package was null")
                file_reference = 'docassemble.base:' + file_reference
            if the_package is not None:
                result['package'] = the_package
        elif len(parts) == 2:
            result['package'] = parts[0]
        result['fullpath'] = docassemble.base.functions.static_filename_path(
            file_reference)
    # sys.stderr.write("path is " + str(result['fullpath']) + "\n")
    if result['fullpath'] is not None:  #os.path.isfile(result['fullpath'])
        if not has_info:
            # Derive filename/mimetype/extension from the resolved path.
            result['filename'] = os.path.basename(result['fullpath'])
            ext_type, result['mimetype'] = get_ext_and_mimetype(
                result['fullpath'])
            path_parts = os.path.splitext(result['fullpath'])
            result['path'] = path_parts[0]
            result['extension'] = path_parts[1].lower()
            result['extension'] = re.sub(r'\.', '', result['extension'])
        #logmessage("Extension is " + result['extension'])
        # Optionally swap in a pre-converted sibling file (e.g. a .pdf
        # generated alongside a .docx), but only if it already exists.
        if convert is not None and result['extension'] in convert:
            #logmessage("Converting...")
            if os.path.isfile(result['path'] + '.' +
                              convert[result['extension']]):
                #logmessage("Found conversion file ")
                result['extension'] = convert[result['extension']]
                result['fullpath'] = result['path'] + '.' + result['extension']
                ext_type, result['mimetype'] = get_ext_and_mimetype(
                    result['fullpath'])
            else:
                sys.stderr.write("Did not find file " + result['path'] + '.' +
                                 convert[result['extension']] + "\n")
                return dict()
        #logmessage("Full path is " + result['fullpath'])
        if os.path.isfile(result['fullpath']) and not has_info:
            add_info_about_file(result['fullpath'], result['path'], result)
    else:
        sys.stderr.write("File reference " + str(file_reference) +
                         " DID NOT EXIST.\n")
    return (result)
Exemplo n.º 41
0
    def _populate_projects(self, iter_obj, yr):
        """Loop through *iter_obj* and sort/clean data based on project_id.

        Produces Project objects keyed by project id.  Sample of the
        flattened data carried by one project:
        {'end': '2012-12-31', 'operating_unit_email': '*****@*****.**',
        'inst_id': '', 'operating_unit': 'Lithuania, Republic of',
        'iati_op_id': 'LT', 'inst_descr': '', 'start': '2005-01-01',
        'operating_unit_id': 'LTU',
        'operating_unit_website': 'http://www.undp.lt/',
        'project_id': '00038726', 'inst_type_id': '',
        'document_name': u'http://www.undp.org/content/dam/undp/documents/projects/LTU/00038726/RC fund.pdf'}

        Arguments:
        iter_obj -- an iterable etree object of IATI activities
        yr -- fiscal year tag appended to each project's fiscal_year list
        """
        counter = 0

        # Get sorted units
        report_units = self.get_and_sort(
            self.undp_export + '/report_units.csv', 'operating_unit')

        # Sorting table for documents by importance: a document's "place"
        # is the index of its best (lowest) category code in this list.
        docs_sort = [
            'A02', 'A03', 'A04', 'A05', 'A01', 'A07', 'A08', 'A09', 'A06',
            'A11', 'A10'
        ]

        # Loop through each IATI activity in the XML
        for event, p in iter_obj:

            # IATI hierarchy used to determine if output or input1
            hierarchy = p.attrib['hierarchy']

            # Check for projects (hierarchy '1'); other levels are skipped.
            if hierarchy == '1':
                obj = Project()

                obj.project_id.value = self._grab_award_id(p[1].text)

                # Check if the project_id is unique
                if obj.project_id.value in self.projects.pks:
                    continue

                obj.fiscal_year.value.append(yr)
                obj.project_title.value = p.find(
                    obj.project_title.xml_key).text.lower()

                obj.project_descr.value = p.find(
                    obj.project_descr.xml_key).text
                documents = p.findall('./document-link')

                if documents:
                    # Parallel lists: document names, URLs, file formats
                    # and sort "places"; extended onto document_name below.
                    names = []
                    links = []
                    format = []
                    places = []

                    for doc in documents:

                        # avoid adding circular links to the same site/project
                        if ('open.undp.org/#project/' +
                                obj.project_id.value) not in doc.get('url'):
                            try:
                                links.append(
                                    urllib2.unquote(doc.get('url')).encode(
                                        'utf-8').decode('utf-8'))
                            except UnicodeDecodeError:
                                links.append(
                                    urllib2.unquote(
                                        doc.get('url')).decode('utf-8'))
                            #links.append(doc.get('url'))

                            # Map 'application/...' MIME types to a bare
                            # extension (no dot); anything else becomes ''.
                            if 'application/' in doc.get('format'):
                                ft = mimetypes.guess_extension(
                                    doc.get('format'), False)
                                if ft is None:
                                    format.append('')
                                else:
                                    format.append(ft.lstrip('.'))
                            else:
                                format.append('')

                            for d in doc.iterchildren(
                                    tag=obj.document_name.key):
                                names.append(d.text)

                            # default place is last
                            place = 100
                            for t in doc.iterchildren(tag='category'):
                                try:
                                    tp = docs_sort.index(t.get('code'))
                                except ValueError:
                                    tp = 100
                                if (tp < place):
                                    place = tp

                            places.append(place)

                    obj.document_name.value.extend(
                        [names, links, format, places])

                # Find start and end dates
                obj.start.value = p.find(obj.start.xml_key).text
                obj.end.value = p.find(obj.end.xml_key).text

                contact = p.findall('./contact-info')
                obj.operating_unit_email.value = [
                    e.text for email in contact for e in email.iterchildren(
                        tag=obj.operating_unit_email.key)
                ][0]

                # Find operating_unit
                # If recipient country didn't exist look for recipient region
                # NOTE(review): the bare excepts below deliberately treat any
                # missing XML element as "not present" -- broad by design.
                try:
                    obj.iati_op_id.value = (p.find(
                        obj.iati_op_id.xml_key).attrib.get('code'))
                    obj.operating_unit.value = p.find(
                        obj.operating_unit.xml_key).text
                    for r in report_units:
                        if (obj.iati_op_id.value == r['iati_operating_unit'] or
                                obj.iati_op_id.value == r['operating_unit']):
                            obj.operating_unit_id.value = r['operating_unit']
                            obj.region_id.value = r[obj.region_id.key]

                except:
                    # No recipient country: match on region description and
                    # use the '998' sentinel for the IATI operating id.
                    region_unit = p.findall("./recipient-region")
                    for ru in region_unit:
                        for r in report_units:
                            if type(ru.text) == type(
                                    r['ou_descr']
                            ) and ru.text == r['ou_descr']:
                                obj.operating_unit_id.value = r[
                                    'operating_unit']
                                obj.operating_unit.value = r['ou_descr']
                    obj.iati_op_id.value = '998'

                # find contact info
                try:
                    for email in contact:
                        for e in email.iterchildren(
                                tag=obj.operating_unit_email.key):
                            obj.operating_unit_email.value = e.text

                    obj.operating_unit_website.value = p.find(
                        obj.operating_unit_website.xml_key).text
                except:
                    pass

                # Check for implementing organization
                try:
                    inst = p.find("./participating-org[@role='Implementing']")
                    obj.inst_id.value = inst.attrib.get(obj.inst_id.key)
                    obj.inst_type_id.value = inst.attrib.get(
                        obj.inst_type_id.key)
                    obj.inst_descr.value = inst.text
                except:
                    pass

                # Populate the Unit Collection
                self._populate_units(obj)

                counter += 1
                self.log('Processing: %s' % counter, True)

                self.projects.add(obj.project_id.value, obj)

        self.log('%s - Project Annuals: %s rows processed' % (yr, counter))
Exemplo n.º 42
0
    def _get_dehydrated_message(self, msg, record):
        """Recursively reduce *msg* into a storable EmailMessage.

        Four cases, in order:
          1. multipart containers are rebuilt with each part dehydrated;
          2. content types not in ``allowed_mimetypes`` are stripped
             (headers kept, body dropped) when configured to do so;
          3. attachments / non-text payloads are saved as a
             MessageAttachment linked to *record*, and replaced in the
             tree by a placeholder whose header carries the attachment pk;
          4. inline text parts are kept, with a best-effort charset repair.
        """
        settings = utils.get_settings()

        new = EmailMessage()
        if msg.is_multipart():
            # Rebuild the container, dehydrating each sub-part in turn.
            for header, value in msg.items():
                new[header] = value
            for part in msg.get_payload():
                new.attach(self._get_dehydrated_message(part, record))
        elif (settings['strip_unallowed_mimetypes']
              and not msg.get_content_type() in settings['allowed_mimetypes']):
            # Disallowed type: keep the headers, drop the body entirely.
            for header, value in msg.items():
                new[header] = value
            # Delete header, otherwise when attempting to  deserialize the
            # payload, it will be expecting a body for this.
            del new['Content-Transfer-Encoding']
            new[settings['altered_message_header']] = (
                'Stripped; Content type %s not allowed' %
                (msg.get_content_type()))
            new.set_payload('')
        elif ((msg.get_content_type() not in settings['text_stored_mimetypes'])
              or ('attachment' in msg.get('Content-Disposition', ''))):
            filename = None
            raw_filename = msg.get_filename()
            if raw_filename:
                filename = utils.convert_header_to_unicode(raw_filename)
            if not filename:
                # No usable filename: derive an extension from the MIME type.
                extension = mimetypes.guess_extension(msg.get_content_type())
            else:
                _, extension = os.path.splitext(filename)
            if not extension:
                extension = '.bin'

            attachment = MessageAttachment()

            # Store under a random name so attachments can never collide.
            attachment.document.save(
                uuid.uuid4().hex + extension,
                ContentFile(
                    six.BytesIO(msg.get_payload(decode=True)).getvalue()))
            attachment.message = record
            for key, value in msg.items():
                attachment[key] = value
            attachment.save()

            # Replace the part with a placeholder that records the pk, so
            # the message can be rehydrated later.
            placeholder = EmailMessage()
            placeholder[settings['attachment_interpolation_header']] = str(
                attachment.pk)
            new = placeholder
        else:
            content_charset = msg.get_content_charset()
            if not content_charset:
                content_charset = 'ascii'
            try:
                # Make sure that the payload can be properly decoded in the
                # defined charset, if it can't, let's mash some things
                # inside the payload :-\
                msg.get_payload(decode=True).decode(content_charset)
            except LookupError:
                # Charset name unknown to Python: fall back to lossy ASCII.
                logger.warning("Unknown encoding %s; interpreting as ASCII!",
                               content_charset)
                msg.set_payload(
                    msg.get_payload(decode=True).decode('ascii', 'ignore'))
            except ValueError:
                # Payload does not actually decode in its declared charset.
                logger.warning(
                    "Decoding error encountered; interpreting %s as ASCII!",
                    content_charset)
                msg.set_payload(
                    msg.get_payload(decode=True).decode('ascii', 'ignore'))
            new = msg
        return new
Exemplo n.º 43
0
def get_extension_by_filename(filename):
    """Return the canonical extension (e.g. '.txt') for *filename*, or None.

    The extension is derived by mapping the filename to a MIME type with
    ``mimetypes.guess_type`` and back with ``mimetypes.guess_extension``,
    which normalizes aliases (e.g. '.jpeg' inputs may map to '.jpg').
    Returns None when the type cannot be determined or the input is not
    path-like, preserving the original best-effort contract without the
    bare ``except:`` that also swallowed SystemExit/KeyboardInterrupt.

    :param filename: file name or path whose extension should be guessed
    :return: extension string including the leading dot, or None
    """
    try:
        mimetype, _encoding = mimetypes.guess_type(filename)
    except (TypeError, AttributeError):
        # Non-string / non-path input: treat as "unknown" rather than raise.
        return None
    if mimetype is None:
        return None
    # guess_extension itself returns None for unmapped types, which is
    # exactly the failure value callers expect.
    return mimetypes.guess_extension(mimetype)
Exemplo n.º 44
0
def main():
    """Process one inbound e-mail file whose path is given in sys.argv[1].

    Parses the message, matches the envelope-to local part against the
    Shortener table, records the e-mail plus a headers.json attachment
    and one attachment per MIME part, then queues a docassemble
    'incoming_email' background action for the interview that owns the
    short code.  Debug breadcrumbs are appended to /tmp/mail.log.
    Exits via sys.exit() when the message cannot be read or the short
    code is unknown.
    """
    fp = open("/tmp/mail.log", "a")
    #fp.write("The file is " + sys.argv[1] + "\n")
    try:
        with open(sys.argv[1], 'rU') as email_fp:
            msg = email.message_from_file(email_fp)
    except Exception as errMess:
        fp.write("Failed to read e-mail message: " + str(errMess) + "\n")
        sys.exit("Failed to read e-mail message")
    # Pull the headers of interest, with sensible fallbacks.
    raw_date = msg.get('Date', msg.get('Resent-Date', None))
    addr_return_path = msg.get('Return-path', None)
    addr_reply_to = msg.get('Reply-to', None)
    addr_to = msg.get('Envelope-to', None)
    addr_from = msg.get('From', msg.get('Sender', None))
    subject = msg.get('Subject', None)
    fp.write("Message to " + str(addr_to) + "\n")
    #fp.write("From was " + str(addr_from) + "\n")
    #fp.write("Subject was " + str(subject) + "\n")
    # Collect recipient lists as [{'name': ..., 'address': ...}, ...].
    to_recipients = list()
    for recipient in getaddresses(
            msg.get_all('to', []) + msg.get_all('resent-to', [])):
        to_recipients.append(dict(name=recipient[0], address=recipient[1]))
    cc_recipients = list()
    for recipient in getaddresses(
            msg.get_all('cc', []) + msg.get_all('resent-cc', [])):
        cc_recipients.append(dict(name=recipient[0], address=recipient[1]))
    recipients = list()
    for recipient in getaddresses(
            msg.get_all('to', []) + msg.get_all('cc', []) +
            msg.get_all('resent-to', []) + msg.get_all('resent-cc', [])):
        recipients.append(dict(name=recipient[0], address=recipient[1]))
    if addr_to is None and len(recipients):
        addr_to = recipients[0]['address']
    #fp.write("recipients are " + str(recipients) + "\n")
    # The short code is the local part of the envelope-to address.
    if addr_to is not None:
        #fp.write("parsed envelope-to: " + str(parseaddr(addr_to)) + "\n")
        short_code = re.sub(r'@.*', '', parseaddr(addr_to)[1])
    else:
        short_code = None
    #fp.write("short code is " + str(short_code) + "\n")
    record = db.session.query(Shortener).filter_by(short=short_code).first()
    if record is None:
        fp.write("short code not found\n")
        sys.exit("short code not found")
        #fp.write("short code found\n")
    #file_number = get_new_file_number(record.uid, 'email', yaml_file_name=record.filename)
    ##fp.write("file number is " + str(file_number) + "\n")
    #saved_file_email = SavedFile(file_number, fix=True)
    # Normalize each address header to a dict, or dict(empty=True) when absent.
    if addr_from is not None:
        #fp.write("parsed from: " + str(parseaddr(addr_from)[1]) + "\n")
        addr_from = dict(name=parseaddr(addr_from)[0],
                         address=parseaddr(addr_from)[1])
    else:
        addr_from = dict(empty=True)
    if addr_return_path is not None:
        #fp.write("parsed return_path: " + str(parseaddr(addr_return_path)[1]) + "\n")
        addr_return_path = dict(name=parseaddr(addr_return_path)[0],
                                address=parseaddr(addr_return_path)[1])
    else:
        addr_return_path = dict(empty=True)
    #fp.write("return_path is " + str(addr_return_path) + "\n")
    if addr_reply_to is not None:
        #fp.write("parsed reply-to: " + str(parseaddr(addr_reply_to)[1]) + "\n")
        addr_reply_to = dict(name=parseaddr(addr_reply_to)[0],
                             address=parseaddr(addr_reply_to)[1])
        #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    else:
        addr_reply_to = dict(empty=True)
    #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    # Prefer the message's own Date header; fall back to receipt time.
    msg_current_time = datetime.datetime.now()
    if raw_date is not None:
        msg_date = datetime.datetime.fromtimestamp(mktime(parsedate(raw_date)))
        #fp.write("msg_date is " + str(msg_date) + "\n")
    else:
        msg_date = msg_current_time
        #fp.write("msg_date set to current time\n")
    headers = list()
    for item in msg.items():
        headers.append([item[0], item[1]])
    #fp.write("headers:\n" + json.dumps(headers) + "\n")

    # Persist the e-mail envelope/metadata row first so attachments can
    # reference email_record.id.
    email_record = Email(short=short_code,
                         to_addr=json.dumps(to_recipients),
                         cc_addr=json.dumps(cc_recipients),
                         from_addr=json.dumps(addr_from),
                         reply_to_addr=json.dumps(addr_reply_to),
                         return_path_addr=json.dumps(addr_return_path),
                         subject=subject,
                         datetime_message=msg_date,
                         datetime_received=msg_current_time)
    db.session.add(email_record)
    db.session.commit()

    # Attachment index 0 is always the raw header list as JSON.
    save_attachment(record.uid, record.filename, 'headers.json',
                    email_record.id, 0, 'application/json', 'json',
                    json.dumps(headers))

    # Save each leaf MIME part as a numbered attachment.
    counter = 1
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        filename = part.get_filename()
        if part.get_content_type() == 'text/plain':
            ext = '.txt'
        else:
            ext = mimetypes.guess_extension(part.get_content_type())
        if not ext:
            ext = '.bin'
        if filename:
            filename = '%03d-%s' % (counter, secure_filename(filename))
        else:
            filename = '%03d-attachment%s' % (counter, ext)
        #fp.write("Filename is " + str(filename) + "\n")
        #fp.write("Content type is " + str(part.get_content_type()) + "\n")

        # Strip the 'NNN-' prefix and the leading dot before storing.
        real_filename = re.sub(r'[0-9][0-9][0-9]-', r'', filename)
        real_ext = re.sub(r'^\.', r'', ext)
        save_attachment(record.uid, record.filename,
                        real_filename, email_record.id, counter,
                        part.get_content_type(), real_ext,
                        part.get_payload(decode=True))

        counter += 1
    fp.close()
    # Build the user_info payload for the background action: either the
    # registered user that owns the short code, or a temporary-user stub.
    user = None
    if record.user_id is not None:
        user = db.session.query(UserModel).filter_by(id=record.user_id).first()
    if user is None:
        user_info = dict(email=None,
                         the_user_id='t' + str(record.temp_user_id),
                         theid=record.temp_user_id,
                         roles=list())
    else:
        user_info = dict(email=user.email,
                         roles=[role.name for role in user.roles],
                         the_user_id=user.id,
                         theid=user.id,
                         firstname=user.first_name,
                         lastname=user.last_name,
                         nickname=user.nickname,
                         country=user.country,
                         subdivisionfirst=user.subdivisionfirst,
                         subdivisionsecond=user.subdivisionsecond,
                         subdivisionthird=user.subdivisionthird,
                         organization=user.organization)
    # Fire-and-forget: the celery AsyncResult is not inspected here.
    result = docassemble.webapp.worker.background_action.delay(
        record.filename,
        user_info,
        record.uid,
        None,
        'http://localhost',
        'http://localhost',
        dict(action='incoming_email', arguments=dict(id=email_record.id)),
        extra=None)
Exemplo n.º 45
0
def parse(args):
    """Iterate WARC records selected by *args* and filter/count/dump them.

    Depending on ``args.dump`` this either appends matching records to a
    fresh WARC file ("warc") or extracts each HTTP payload into a
    directory tree derived from the record's target URL ("content").
    Per-record statistics are accumulated in the module-level counters
    via ``inc`` and printed at the end.

    NOTE(review): ``args.silence`` is used here as a *verbose* flag --
    printing happens when it is truthy; confirm the intended polarity.
    """
    #Clear output warc file.
    if args.dump == "warc":
        if args.silence:
            print("Recording", args.dump, "to", args.output + ".")
        with open(args.output_path_sub + args.output, "wb"):
            pass

    for record in warc_records(args.string, args.path):
        try:
            #Filter out unwanted entries.
            if not checkFilter(args.filter, record):
                continue

            #Increment Index counters.
            if args.silence:
                inc("records")
                inc(record, "warc-type", "types")
                inc(record, "content_type", "warc-content")
                if record.http:
                    inc(record.http, "content_type", "http-content")
                    inc(record.http, "error", "status")

            #Dump records to file.
            if args.dump == "warc":
                with open(args.output_path_sub + args.output, "ab") as output:
                    record.write_to(output)

            if args.dump == "content":
                url = urlparse(unquote(record['WARC-Target-URI']))

                #Set up folder
                index = url.path.rfind("/") + 1
                file = url.path[index:]
                path = url.path[:index]

                #Process filename: a path segment with no dot is treated
                #as a directory whose content is saved as index.html.
                if "." not in file:
                    path += file
                    if not path.endswith("/"):
                        path += "/"

                    file = 'index.html'

                #Final fixes: dots in directory names become dashes, and
                #a leading 'www.' is stripped from the hostname.
                path = path.replace(".", "-")
                host = url.hostname.replace('www.', '', 1)
                path = args.output_path_sub + host + path

                #Create new directories
                if not os.path.exists(path):
                    try:
                        os.makedirs(path)
                    except OSError:
                        #Path component too long for the OS: truncate each
                        #segment to 25 characters and retry.
                        path = "/".join([i[:25] for i in path.split("/")])
                        os.makedirs(path)

                #Test if file has a proper extension.
                index = file.index(".")
                suffix = file[index:]
                content = record.http.get("content_type", "")
                slist = mimetypes.guess_all_extensions(content)
                if suffix not in slist:
                    #Correct suffix if we can.
                    suffix = mimetypes.guess_extension(content)
                    if suffix:
                        file = file[:index] + suffix
                    else:
                        inc(record.http, "content_type", "unknown mime type")

                #Check for gzip compression.
                if record.http.get("content-encoding", None) == "gzip":
                    file += ".gz"

                path += file

                #If Duplicate file then insert numbers
                index = path.rfind(".")
                temp = path
                n = 0
                while os.path.isfile(temp):
                    n += 1
                    temp = path[:index] + "(" + str(n) + ")" + path[index:]
                path = temp

                #Write file.
                with open(path, 'wb') as fp:
                    record.http.write_payload_to(fp)

        except Exception:
            #Optionally quarantine bad records instead of aborting the run.
            if args.error:
                if args.silence:
                    print("Error in record. Recording to error.warc.")
                with open(args.output_path_sub + "error.warc", "ab") as fp:
                    record.write_to(fp)
            else:
                raise

    #print results
    if args.silence:
        print("-----------------------------")
        for i in counts:
            print("\nCount of {}.".format(i))
            pprint(counts[i])
Exemplo n.º 46
0
    def _load_urllib(self, filename, kwargs):
        '''(internal) Loading a network file. First download it, save it to a
        temporary file, and pass it to _load_local().

        Supports http(s) and smb:// URLs.  The temporary file's suffix is
        taken from a '#.ext' URL fragment if present, otherwise guessed
        from the response Content-Type (with EXT_ALIAS normalization) or,
        failing that, from the URL path.  On error the registered clients
        are notified and the error image is returned; the temporary file
        is always removed in the finally block.
        '''
        if PY2:
            import urllib2 as urllib_request

            def gettype(info):
                return info.gettype()
        else:
            import urllib.request as urllib_request

            def gettype(info):
                return info.get_content_type()

        proto = filename.split(':', 1)[0]
        if proto == 'smb':
            try:
                # note: it's important to load SMBHandler every time
                # otherwise the data is occasionally not loaded
                from smb.SMBHandler import SMBHandler
            except ImportError:
                Logger.warning(
                    'Loader: can not load PySMB: make sure it is installed')
                return
        import tempfile
        data = fd = _out_osfd = None
        try:
            _out_filename = ''

            if proto == 'smb':
                # read from samba shares
                fd = urllib_request.build_opener(SMBHandler).open(filename)
            else:
                # read from internet
                request = urllib_request.Request(filename)
                if Config.has_option('network', 'useragent'):
                    useragent = Config.get('network', 'useragent')
                    if useragent:
                        request.add_header('User-Agent', useragent)
                opener = urllib_request.build_opener()
                fd = opener.open(request)

            if '#.' in filename:
                # allow extension override from URL fragment
                suffix = '.' + filename.split('#.')[-1]
            else:
                ctype = gettype(fd.info())
                suffix = mimetypes.guess_extension(ctype)
                suffix = LoaderBase.EXT_ALIAS.get(suffix, suffix)
                if not suffix:
                    # Content-Type was unhelpful: fall back to the last
                    # dotted component of the URL path.
                    # strip query string and split on path
                    parts = filename.split('?')[0].split('/')[1:]
                    while len(parts) > 1 and not parts[0]:
                        # strip out blanks from '//'
                        parts = parts[1:]
                    if len(parts) > 1 and '.' in parts[-1]:
                        # we don't want '.com', '.net', etc. as the extension
                        suffix = '.' + parts[-1].split('.')[-1]
            _out_osfd, _out_filename = tempfile.mkstemp(prefix='kivyloader',
                                                        suffix=suffix)

            idata = fd.read()
            fd.close()
            fd = None

            # write to local filename
            write(_out_osfd, idata)
            close(_out_osfd)
            _out_osfd = None

            # load data
            data = self._load_local(_out_filename, kwargs)

            # FIXME create a clean API for that
            for imdata in data._data:
                imdata.source = filename
            return data
        except Exception as ex:
            Logger.exception('Loader: Failed to load image <%s>' % filename)
            # close file when remote file not found or download error
            try:
                if _out_osfd:
                    close(_out_osfd)
            except OSError:
                pass

            # update client
            for c_filename, client in self._client[:]:
                if filename != c_filename:
                    continue
                # got one client to update
                client.image = self.error_image
                client.dispatch('on_error', error=ex)
                self._client.remove((c_filename, client))

            return self.error_image
        finally:
            # Cleanup runs on both success and failure paths: close any
            # still-open descriptors and delete the temporary file.
            if fd:
                fd.close()
            if _out_osfd:
                close(_out_osfd)
            if _out_filename != '':
                unlink(_out_filename)
Exemplo n.º 47
0
 def getExtension(mimetype):
     ext = mimetypes.guess_extension(mimetype.split(';')[0])
     if ext is None:
         raise Exception("Unsupported/unrecognized mimetype: " + mimetype)
     return ext
def process_apids(apid_matches, *, session, csv_writer, logger):
    """
    Given a list of APID tuples as returned by `process_gedcom_text()`,
    an active session, and a csv writer, it downloads images from Ancestry.com.

    Presumes the current directory of `os` is the output directory.

    Returns a list of apids with errors.
    """

    total_apid_matches = len(apid_matches)
    # A dict with dbids as keys, and items as a list of pids.
    processed_apids = defaultdict(list)
    iid_regex = re.compile(r"var iid='([^\s']+)';")
    # A dict with IID's as keys, and `processed_iid` instances as items.
    processed_iids = {}

    class processed_iid(object):
        # Record of an already-handled image: its file extension and the
        # APIDs that reference it.
        def __init__(self, extension, apids=None):
            # NOTE: a mutable default argument (apids=[]) would be shared
            # between every instance created without an explicit list, so
            # use None as the sentinel and build a fresh list per instance.
            self.extension = extension
            self.apids = [] if apids is None else apids

    problem_apids = set()

    # Process each apid.
    for i, match in enumerate(apid_matches, start=1):

        sour, apid, indiv, dbid, pid = match

        fields = {
            'sour': sour,
            'apid': apid,
            'indiv': indiv,
            'dbid': dbid,
            'pid': pid,
        }

        logger.info("Processing APID {0} of {1} <APID {2}>...".format(
            i, total_apid_matches, apid))

        # Check if the apid has previously been processed.
        if dbid in processed_apids and pid in processed_apids[dbid]:
            logger.info(
                "    > APID previously processed as part of another source.")
            logger.info("    > Finished!")
            continue
        else:
            # Mark the apid as processed now, so even if something fails, we know not to check it again.
            processed_apids[dbid].append(pid)

        # Visit the record page corresponding to the app id.
        logger.info("    > Getting the record page for the APID...")
        record_page = session.get(
            'http://search.ancestry.com/cgi-bin/sse.dll?indiv={0}&dbid={1}&h={2}'
            .format(indiv, dbid, pid))
        if record_page.status_code != 200:
            logger.error(
                "    > There was an error when trying to get the record page for the APID."
            )
            problem_apids.add(apid)
            logger.info("    > Aborted!")
            continue

        # Extract the image id associated with the record from the returned html.
        logger.info(
            "    > Processing the record page to determine the image ID...")
        match = iid_regex.search(record_page.text)

        if not match:
            # TODO, more and better checks could be performed rather than presuming there is no image at this stage, such as checking for a thumbnail.
            logger.info(
                "    > An image ID could not be found. Either the record does not have an image, or the record page was in an unexpected format."
            )
            fields['image'] = ''
            fields['extension'] = ''
            logger.info("    > Writing results to CSV file...")
            csv_writer.writerow(fields)
            logger.info("    > Finished!")
            continue

        fields['image'] = iid = match.group(1)

        # Check if the iid has previously been processed.
        if iid in processed_iids:
            logger.info(
                "    > The image for this record has previously been processed."
            )
            fields['extension'] = processed_iids[iid].extension
            logger.info("    > Writing results to CSV file...")
            csv_writer.writerow(fields)
            processed_iids[iid].apids.append(apid)
            logger.info("    > Finished!")
            continue
        else:
            # Mark the iid as processed now, so even if something fails, we know not to check it again.
            processed_iids[iid] = processed_iid(None, [apid])

        # Get the api data related to the image.
        logger.info("    > Get information regarding the image...")
        image_page = session.get(
            'http://www.ancestry.com/interactive/api/v2/Media/GetMediaInfo/{0}/{1}/{2}'
            .format(dbid, iid, pid))
        # BUGFIX: this previously tested record_page.status_code (the earlier,
        # already-validated response) instead of the response just fetched.
        if image_page.status_code != 200:
            logger.error(
                "    > There was an error when trying to get the image info.")
            problem_apids.add(apid)
            logger.info("    > Aborted!")
            continue

        # Extract the download url for the returned json.
        logger.info("    > Processing the image information...")
        image_page_json = image_page.json()
        try:
            download_url = image_page_json['ImageServiceUrlForDownload']
        except KeyError:
            logger.error(
                "    > There was an error when trying to get the download URL from the image info."
            )
            problem_apids.add(apid)
            logger.info("    > Aborted!")
            continue

        # Download the image.
        logger.info("    > Downloading image...")
        image_download = session.get(download_url, stream=True)

        if image_download.status_code != 200:
            logger.error(
                "    > There was an error when trying to download the image.")
            problem_apids.add(apid)
            logger.info("    > Aborted!")
            continue

        # Save the image to a file.
        logger.info("    > Saving image...")

        # Ensure the dbid has a folder for saving the image into.
        if not os.path.exists(dbid):
            os.makedirs(dbid)

        content_type = image_download.headers['content-type']
        # Strip any parameters (e.g. '; charset=...') before guessing, and
        # fall back to 'bin' when the type is unknown: guess_extension()
        # returns None in that case, which previously crashed on .strip('.').
        guessed = mimetypes.guess_extension(content_type.split(';')[0].strip())
        extension = guessed.strip('.') if guessed else 'bin'
        if extension == 'jpeg' or extension == 'jpe':
            extension = 'jpg'
        fields['extension'] = extension
        # Ensure the extension has been recorded for later use.
        if processed_iids[iid].extension is None:
            processed_iids[iid].extension = extension

        try:
            with open("{0}/{1}.{2}".format(dbid, iid, extension), 'wb') as f:
                for chunk in image_download.iter_content(1024):
                    f.write(chunk)
        except Exception as e:
            logger.error(
                '    > There was an unknown error when saving the file: ' +
                str(e))
            logger.info("    > Aborted!")
            continue

        logger.info("    > Image file saved successfully.")

        # Write results to csv file.
        logger.info("    > Writing results to CSV file...")
        csv_writer.writerow(fields)
        logger.info("    > Finished!")

    # All done.
    return problem_apids
Exemplo n.º 49
0
    def binary_content(cls, xmlid=None, model='ir.attachment', id=None, field='datas',
                       unique=False, filename=None, filename_field='datas_fname', download=False,
                       mimetype=None, default_mimetype='application/octet-stream',
                       access_token=None, related_id=None, access_mode=None, env=None):
        """ Get file, attachment or downloadable content

        If the ``xmlid`` and ``id`` parameter is omitted, fetches the default value for the
        binary field (via ``default_get``), otherwise fetches the field for
        that precise record.

        :param str xmlid: xmlid of the record
        :param str model: name of the model to fetch the binary from
        :param int id: id of the record from which to fetch the binary
        :param str field: binary field
        :param bool unique: add a max-age for the cache control
        :param str filename: choose a filename
        :param str filename_field: if not create an filename with model-id-field
        :param bool download: apply headers to download the file
        :param str mimetype: mintype of the field (for headers)
        :param related_id: the id of another record used for custom_check
        :param  access_mode: if truthy, will call custom_check to fetch the object that contains the binary.
        :param str default_mimetype: default mintype if no mintype found
        :param str access_token: optional token for unauthenticated access
                                 only available  for ir.attachment
        :param Environment env: by default use request.env
        :returns: (status, headers, content)
        """
        # Fall back to the environment of the current HTTP request when none given.
        env = env or request.env
        # get object and content
        obj = None
        if xmlid:
            obj = cls._xmlid_to_obj(env, xmlid)
        elif id and model in env.registry:
            obj = env[model].browse(int(id))
        # obj exists
        if not obj or not obj.exists() or field not in obj:
            return (404, [], None)

        # access token grant access
        if model == 'ir.attachment' and access_token:
            obj = obj.sudo()
            if access_mode:
                if not cls._check_access_mode(env, id, access_mode, model, access_token=access_token,
                                             related_id=related_id):
                    return (403, [], None)
            # consteq: constant-time comparison to avoid token timing attacks
            elif not consteq(obj.access_token or u'', access_token):
                return (403, [], None)

        # check read access
        try:
            # Reading __last_update triggers the ACL check; an AccessError here
            # means the current user may not read the record at all.
            last_update = obj['__last_update']
        except AccessError:
            return (403, [], None)

        status, headers, content = None, [], None

        # attachment by url check
        module_resource_path = None
        if model == 'ir.attachment' and obj.type == 'url' and obj.url:
            # NOTE(review): non-raw string — '\w' relies on Python passing the
            # unknown escape through; a raw string r"^/(\w+)/(.+)$" would be safer.
            url_match = re.match("^/(\w+)/(.+)$", obj.url)
            if url_match:
                module = url_match.group(1)
                module_path = get_module_path(module)
                module_resource_path = get_resource_path(module, url_match.group(2))
                if module_path and module_resource_path:
                    module_path = os.path.join(os.path.normpath(module_path), '')  # join ensures the path ends with '/'
                    module_resource_path = os.path.normpath(module_resource_path)
                    # Path-traversal guard: only serve files located inside the module.
                    if module_resource_path.startswith(module_path):
                        with open(module_resource_path, 'rb') as f:
                            content = base64.b64encode(f.read())
                        last_update = pycompat.text_type(os.path.getmtime(module_resource_path))

            if not module_resource_path:
                module_resource_path = obj.url

            if not content:
                # No local file found: answer with a redirect to the URL itself.
                status = 301
                content = module_resource_path
        else:
            content = obj[field] or ''

        # filename
        default_filename = False
        if not filename:
            if filename_field in obj:
                filename = obj[filename_field]
            if not filename and module_resource_path:
                filename = os.path.basename(module_resource_path)
            if not filename:
                default_filename = True
                filename = "%s-%s-%s" % (obj._name, obj.id, field)

        # mimetype
        # NOTE(review): the `mimetype` parameter is unconditionally overwritten
        # here by the record's own mimetype (or a guess) — confirm intended.
        mimetype = 'mimetype' in obj and obj.mimetype or False
        if not mimetype:
            if filename:
                mimetype = mimetypes.guess_type(filename)[0]
            if not mimetype and getattr(env[model]._fields[field], 'attachment', False):
                # for binary fields, fetch the ir_attachement for mimetype check
                attach_mimetype = env['ir.attachment'].search_read(domain=[('res_model', '=', model), ('res_id', '=', id), ('res_field', '=', field)], fields=['mimetype'], limit=1)
                mimetype = attach_mimetype and attach_mimetype[0]['mimetype']
            if not mimetype:
                try:
                    decoded_content = base64.b64decode(content)
                except base64.binascii.Error:  # if we could not decode it, no need to pass it down: it would crash elsewhere...
                    return (404, [], None)
                mimetype = guess_mimetype(decoded_content, default=default_mimetype)

        # extension
        _, existing_extension = os.path.splitext(filename)
        if not existing_extension or default_filename:
            extension = mimetypes.guess_extension(mimetype)
            if extension:
                filename = "%s%s" % (filename, extension)

        headers += [('Content-Type', mimetype), ('X-Content-Type-Options', 'nosniff')]

        # cache
        # ETag is an MD5 of the (base64) content; 304 when the client already has it.
        etag = bool(request) and request.httprequest.headers.get('If-None-Match')
        retag = '"%s"' % hashlib.md5(pycompat.to_text(content).encode('utf-8')).hexdigest()
        status = status or (304 if etag == retag else 200)
        headers.append(('ETag', retag))
        headers.append(('Cache-Control', 'max-age=%s' % (STATIC_CACHE if unique else 0)))

        # content-disposition default name
        if download:
            headers.append(('Content-Disposition', cls.content_disposition(filename)))
        return (status, headers, content)
Exemplo n.º 50
0
def get_suffix_for(value: bytes):
    """Return a file suffix (e.g. '.png') guessed from raw content, or None.

    Sniffs the MIME type from the byte signature via libmagic, then maps it
    to a conventional extension with the mimetypes registry.
    """
    detected_mime = magic.from_buffer(value, mime=True)
    return mimetypes.guess_extension(detected_mime)
Exemplo n.º 51
0
 def url_fetch_completed(cls, usr, url_name, directory,
                         archive_html, row, settings_row,
                         media_path, media_element, *args):
     """Post-process a completed URL fetch: derive a title, favicon and
     preview image, create or update the Library row, archive the content
     to disk, and optionally auto-summarize/auto-tag it.

     The fetch result object is expected as the last positional argument
     (``args[-1]``). Returns the id of the created/updated Library row.
     """
     ext = None
     save = False
     save_text = False
     favicon_link = None
     final_og_link = None
     summary = 'none'
     # The request/response wrapper is passed as the last extra argument.
     req = args[-1]
     tags_list = []
     save_summary = False
     if req and req.content_type:
         # Strip any parameters ('; charset=...') from the content type.
         if ';' in req.content_type:
             content_type = req.content_type.split(';')[0].strip()
         else:
             content_type = req.content_type
         if content_type == 'text/plain':
             ext = '.txt'
         else:
             ext = guess_extension(content_type)
         logger.debug('{} ----> {}'.format(content_type, ext))
     if req and req.html and not req.binary:
         if 'text/html' in req.content_type:
             soup = BeautifulSoup(req.html, 'html.parser')
             if soup.title:
                 title = soup.title.text
                 # YouTube serves a generic <title>; recover the real one
                 # from the inline 'document.title = ...' script.
                 if title.lower() == 'youtube':
                     try_srch = re.search('document.title[^;]*', req.html)
                     if try_srch:
                         title = try_srch.group().replace('document.title = ', '')
             else:
                 title = cls.unquote_title(url_name)
             # Favicon discovery: <link rel=icon>, then rel="shortcut icon",
             # then any <link> pointing at a .ico, finally /favicon.ico.
             ilink = soup.find('link', {'rel':'icon'})
             slink = soup.find('link', {'rel':'shortcut icon'})
             mlink = soup.find('meta', {'property':'og:image'})
             if mlink:
                 final_og_link = mlink.get('content', '')
             if ilink:
                 favicon_link = cls.format_link(ilink.get('href'), url_name)
             elif slink:
                 favicon_link = cls.format_link(slink.get('href'), url_name)
             else:
                 for link in soup.find_all('link'):
                     rel = link.get('href')
                     if (rel and (rel.endswith('.ico') or '.ico' in rel)):
                         favicon_link = cls.format_link(rel, url_name)
                 if not favicon_link:
                     urlp = urlparse(url_name)
                     favicon_link = urlp.scheme + '://' + urlp.netloc + '/favicon.ico'
                     
             if archive_html or (settings_row and settings_row.auto_archive):
                 save_text = True
             if settings_row and (settings_row.autotag or settings_row.auto_summary):
                 summary, tags_list = Summarizer.get_summary_and_tags(req.html,
                                                                      settings_row.total_tags)
         else:
             # Non-HTML textual content: download it as-is.
             title = cls.unquote_title(url_name)
             save = True
     elif req and req.binary:
         title = cls.unquote_title(url_name)
         save = True
     else:
         # No usable response: fall back to a generic binary extension.
         ext = '.bin'
         title = url_name.rsplit('/', 1)[-1]
     if row is None:
         if settings_row and settings_row.reader_theme:
             reader_theme = settings_row.reader_theme
         else:
             reader_theme = UserSettings.WHITE
         row = Library.objects.create(usr=usr,
                                      directory=directory,
                                      url=url_name, title=title,
                                      summary=summary,
                                      timestamp=timezone.now(),
                                      media_element=media_element,
                                      reader_mode=reader_theme)
     else:
         logger.debug('row - exists')
     if not media_path:
         # Archive layout: ARCHIVE_LOCATION/<EXT-UPPER>/<row.id>/<row.id><ext>
         if ext and ext.startswith('.'):
             out_dir = ext[1:].upper()
         else:
             out_dir = str(ext).upper()
         if not ext:
             print(req.content_type)
         out_title = str(row.id) + str(ext)
         media_dir = os.path.join(settings.ARCHIVE_LOCATION, out_dir)
         if not os.path.exists(media_dir):
             os.makedirs(media_dir)
         if not os.path.exists(settings.FAVICONS_STATIC):
             os.makedirs(settings.FAVICONS_STATIC)
         media_path_parent = os.path.join(media_dir, str(row.id))
         final_favicon_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.ico')
         final_og_image_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.png')
         media_path = os.path.join(media_path_parent, out_title)
         row.media_path = media_path
         row.save()
         if favicon_link and favicon_link.startswith('http'):
             cls.vnt.get(favicon_link, out=final_favicon_path)
         logger.debug(final_og_link)
         if final_og_link and final_og_link.startswith('http'):
             cls.vnt.get(final_og_link, out=final_og_image_path)
     elif media_path and row:
         # Row already has an archive location: refresh summary/icons only.
         final_favicon_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.ico')
         final_og_image_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.png')
         media_path_parent, out_title = os.path.split(media_path)
         if settings_row and settings_row.auto_summary and summary:
             row.summary = summary
         if settings_row and not tags_list:
             row.save()
         else:
             # Defer saving: edit_tags() below will persist summary and tags.
             save_summary = True
         if (not os.path.exists(final_favicon_path)
                 and favicon_link and favicon_link.startswith('http')):
             cls.vnt.get(favicon_link, out=final_favicon_path)
         if (not os.path.exists(final_og_image_path)
                 and final_og_link and final_og_link.startswith('http')):
             cls.vnt.get(final_og_link, out=final_og_image_path)
     if save or save_text:
         if not os.path.exists(media_path_parent):
             os.makedirs(media_path_parent)
         if save:
             # Binary/non-HTML: download the remote file to the archive path.
             cls.vnt.get(url_name, out=media_path)
         else:
             # HTML: write the already-fetched markup to disk.
             with open(media_path, 'w') as fd:
                 fd.write(req.html)
         if settings_row and ext in ['.htm', '.html']:
             cls.convert_html_pdf(media_path_parent, settings_row,
                                  row, url_name, media_path, media_element)
     if settings_row and tags_list:
         if save_summary:
             cls.edit_tags(usr, row.id, ','.join(tags_list), '', old_row=row)
         else:
             cls.edit_tags(usr, row.id, ','.join(tags_list), '')
     return row.id
Exemplo n.º 52
0
def export_inbox(connection, fetch_uid, fetch_protocol):
    """Fetch one e-mail by IMAP UID, optionally decrypt it, and export it.

    Creates a per-message directory named ``<uid>_<sender>_<timestamp>``,
    saves attachments under ``attachments/`` and the body as
    ``main-body.html``. Encrypted mails (marked by a custom ``Encryption``
    header) require a 32-byte key, obtained either from a local file or
    downloaded via anonfiles; after 5 failed attempts the mail is deleted
    from the server.
    """

    # Characters not allowed in file/directory names (plus control chars).
    sp_ch = [
        '<', '>', '?', '*', ':', '|', '/', '"', '\\', '\r', '\n', '\t', '\b',
        '\a'
    ]

    success_var, email_data = connection.uid('fetch', fetch_uid,
                                             fetch_protocol)

    email_data = email.message_from_bytes(email_data[0][1])

    email_info = {
        'Subject': '',
        'From': '',
        'To': '',
        'Date': '',
        'Encryption': ''
    }
    disposition = []

    # Walk all MIME parts once, collecting headers and attachment parts.
    for email_part in email_data.walk():

        if bool(email_part['Subject']):
            email_info['Subject'] = email_part['Subject']

        if bool(email_part['From']):
            email_info['From'] = email_part['From']

        if bool(email_part['To']):
            email_info['To'] = email_part['To']

        if bool(email_part['Date']):
            email_info['Date'] = email_part['Date']

        if bool(email_part['Content-Disposition']):
            disposition.append(email_part)

        if bool(email_part['Encryption']):
            email_info['Encryption'] = email_part['Encryption']

    email_subject = email_info['Subject']
    if not bool(email_subject):
        email_subject = '(no subject)'

    # Extract the bare address from e.g. 'Name <user@host>' and sanitize it.
    email_sender = email_info['From']
    email_sender = re.findall('[^<> ]+@[^<> ]+', email_sender)[0]
    email_sender = ''.join(x for x in email_sender if not x in sp_ch)

    email_receiver = email_info['To']
    if not bool(email_receiver):
        email_receiver = '(empty)'

    # Build the output directory name and a display timestamp in local time.
    email_time = email_info['Date']
    sub_dirname = dateutil.parser.parse(email_time).astimezone(
        dateutil.tz.tzlocal()).strftime('%d.%m.%Y-%H.%M.%S')
    sub_dirname = f'{fetch_uid.decode("utf-8")}_{email_sender}_{sub_dirname}'
    datetime = dateutil.parser.parse(email_time).astimezone(
        dateutil.tz.tzlocal()).strftime('%d/%m/%Y %X')

    key = None
    if bool(email_info['Encryption']):

        print('\n\tThis E-Mail is secured by encryption.')
        print(
            f'\n\tHINTS:\n\t\tSender : {email_sender}\n\t\tSubject : {email_subject}'
        )
        print('\n\tYou have only 5 attempts to enter the correct password.')
        print('\tAfter that the E-Mail will be deleted.')
        print('\n\tBrowse for a key containing file instead (5 attempts) ?')
        response = yesno()

        if response in ['Y', 'y']:

            # A dummy Tk root window is needed so the file dialog can open.
            win = tkinter.Tk()
            win.geometry('200x50')
            win.title('Non functional window')

            label = tkinter.Label(win, text='IGNORE THIS WINDOW')
            label.pack()

            for i in range(5):
                print(f'\n\t[Attempt {i+1} of 5]')
                key_path = filedialog.askopenfilename(
                    initialdir=os.environ.get('userprofile'),
                    title='Select the file having the key',
                    filetypes=[('Binary', '*.bin')])
                with open(key_path, 'rb') as f:
                    key = f.read()
                #os.remove(key_path)
                # Only a 32-byte (256-bit) key is considered valid.
                if len(key) == 32:
                    print('\tKEY ACCEPTED\n\n\tDecrypting .....')
                    break
                else:
                    print('\tWRONG KEY')
                    key = None
                    continue
            win.destroy()

        else:

            for i in range(5):
                print(f'\n\t[Attempt {i+1} of 5]')
                # NOTE(review): the next line is garbled — the original code
                # was masked ('******') by the source this was scraped from;
                # the password handling between input() and key_name cannot
                # be reconstructed from here.
                anon_id = input('\n\tPASSWORD : '******'temp-{}'.format(random.randint(0,9999))
                        api_handler = AnonFile('api_key')
                        sys.stdout = open(os.devnull, 'w')
                        api_handler.download_file(
                            f'https://anonfiles.com/{anon_id}/{key_name}_bin')
                        sys.stdout = sys.__stdout__
                        #with open(api_res, 'r') as f :
                        #	dl_res = f.read()
                        #os.remove(api_res)

                        #if 'Error -- 403: Forbidden' in dl_res :
                        #	print('\tWRONG PASSWORD')
                        #	continue
                        if os.path.exists(f'{key_name}.bin'):
                            with open(f'{key_name}.bin', 'rb') as f:
                                key = f.read()
                            os.remove(f'{key_name}.bin')
                            print('\tPASSWORD ACCEPTED\n\n\tDecrypting .....')
                            break
                        else:
                            print('\tWRONG PASSWORD')
                            continue
                    else:
                        print('\tWRONG PASSWORD')
                        continue
                else:
                    print('\tWRONG PASSWORD')
                    continue

        if not key:
            # All attempts exhausted: delete the message from the server.
            print('\n\tAll attempts failed.\n\tDeleting the E-Mail .....')
            connection.uid('store', fetch_uid, '+FLAGS', '\\Deleted')
            connection.expunge()
            print('\tE-Mail deleted.')
            return

    # existence of key implies existence of encryption and vice-versa otherwise the function would have quit by now.

    sub_dirname = os.path.join(os.getcwd(), sub_dirname)
    check_isdir(sub_dirname)
    os.chdir(sub_dirname)

    attachment_dir = os.path.join(os.getcwd(), 'attachments')

    # Save every attachment / inline part collected earlier.
    for part in disposition:

        filename = part.get_filename()

        if filename == None:
            filename = ''
        else:
            filename = os.path.basename(filename)
            if key:
                # Encrypted attachments are expected to carry a '.encrypted' suffix.
                if filename.endswith('.encrypted'):
                    filename = filename.replace('.encrypted', '')
                else:
                    print(
                        '\nSomeone messed with the mail-sending source.\nAttachment names may be affected.'
                    )

        filename = ''.join(x for x in filename if not x in sp_ch)
        name, ext = os.path.splitext(filename)

        if not ext:
            # No extension supplied: derive one from the part's MIME type.
            ext = mimetypes.guess_extension(part.get_content_type())

        if not name:
            if 'attachment' in part['Content-Disposition']:
                name = 'untitled-attachment-{}'.format(random.randint(0, 9999))

            elif 'inline' in part['Content-Disposition']:
                name = 'untitled-inline-{}'.format(random.randint(0, 999))

        filename = name + ext

        # Keep within common filesystem filename length limits.
        if len(filename) > 255:
            name, ext = os.path.splitext(filename)
            name = name[:(255 - len(ext))]
            filename = name + ext

        check_isdir(attachment_dir)
        os.chdir(attachment_dir)

        with open(filename, 'wb') as file:
            payload = part.get_payload(decode=True)
            if key:
                payload = decrypt(payload, key)
            file.write(payload)

    os.chdir(sub_dirname)

    # Assemble the message body into a single HTML file, prefixed with headers.
    with open('main-body.html', 'wb') as file:

        initial = (
            f'<html><body>\nSubject\t:\t{email_subject}<br><br>From\t:\t{email_sender}<br><br>\
		To\t:\t{email_receiver}<br><br>Date\t:\t{datetime}<br><br>' + '*' * 200 +
            '<br><br>\n</body></html>\n\n').encode('utf-8')

        file.write(initial)

        for email_part in email_data.walk():

            if email_part.get_content_maintype() == 'text':

                if mimetypes.guess_extension(
                        email_part.get_content_type()) == '.html':
                    payload = email_part.get_payload(decode=True)
                    if key:
                        payload = base64.b64decode(payload)
                        payload = decrypt(payload, key)
                    file.write(payload)

                elif mimetypes.guess_extension(
                        email_part.get_content_type()) == '.txt':
                    # Plain-text parts are converted to HTML via rst2html5
                    # through a randomly named temporary file pair.
                    tmp_name = ''.join(
                        chr(random.randint(97, 122)) for i in range(10))
                    tmp_file = open(f'{tmp_name}.txt', 'wb')
                    payload = email_part.get_payload(decode=True)
                    if key:
                        payload = base64.b64decode(payload)
                        payload = decrypt(payload, key)
                    tmp_file.write(payload)
                    tmp_file.close()

                    os.system('rst2html5 {0}.txt > {0}.html'.format(tmp_name))
                    tmp_file = open(f'{tmp_name}.html', 'rb')
                    file.write(tmp_file.read())
                    tmp_file.close()

                    os.remove(f'{tmp_name}.txt')
                    os.remove(f'{tmp_name}.html')
Exemplo n.º 53
0
def main():
	global args
	global config
	global es
	global verbose
	global rcode

	parser = argparse.ArgumentParser(
		description = 'Unpack MIME attachments from a file and check them against virustotal.com')
	parser.add_argument('-d', '--directory',
		dest = 'directory',
		help = 'directory where files will be extracted (default: /tmp) %%d,%%m,%%y can use used for dynamic names',
		metavar = 'DIRECTORY')
	parser.add_argument('-v', '--verbose',
		action = 'store_false',
		dest = 'verbose',
		help = 'verbose output',
		default = False)
	parser.add_argument('-c', '--config',
		dest = 'config_file',
		help = 'configuration file (default: /etc/mime2vt.conf)',
		metavar = 'CONFIG')
	parser.add_argument('-l', '--log',
		dest = 'dump_file',
		help = 'mail dump file (default /tmp/message.dump)',
		metavar = 'DUMPFILE')
	args = parser.parse_args()

	# Default values
	if not args.directory:
		args.directory = '/tmp'
	if not args.config_file:
		args.config_file = '/etc/mime2vt.conf'

	#writeLog('DEBUG: config_file = %s' % args.config_file)

	try:
		c = ConfigParser.ConfigParser()
		c.read(args.config_file)
		config['apiKey'] = c.get('virustotal', 'apikey')
		excludetypes = c.get('virustotal', 'exclude').split(',')
		# Elasticsearch config
		config['esServer'] = c.get('elasticsearch', 'server')
		config['esIndex'] = c.get('elasticsearch', 'index')
		config['dbPath'] = c.get('database', 'dbpath')
	except OSError as e:
		writeLog('Cannot read config file %s: %s' % (args.config_file, e.errno))
		exit

	if config['esServer']:
		logging.basicConfig()
		es = Elasticsearch([config['esServer']])

	# Create the SQLite DB
	dbCreate()

	# Read the mail flow from STDIN
	data = "" . join(sys.stdin)
	msg = email.message_from_string(data)

	if usePyzMail:
		mailheaders = parseMailheaders(data)

	if args.dump_file:
		try:
			fp = open(args.dump_file, 'a')
		except OSError as e:
			writeLog('Cannot dump message to %s: %s' % (args.dump_file, e.errno))
		fp.write(data)
		fp.close()

	# Process MIME parts
	for part in msg.walk():
		contenttype = part.get_content_type()
		filename = part.get_param('name')

		# Hack: Search for a .js extension
		try:
			fname, fextension = os.path.splitext(filename)
		except:
			fextension = "none"

		data = part.get_payload(None, True)
		if data:
			md5 = hashlib.md5(data).hexdigest()
			#if dbMD5Exists(md5):
			#	writeLog("Skipping existing MD5 %s" % md5)
			#	continue

			# New: Extract URLS
			if contenttype in [ 'text/html', 'text/plain' ]:
				urls = []
				# Source: https://gist.github.com/uogbuji/705383
				GRUBER_URLINTEXT_PAT = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
				lines = data.split('\n')
				for line in lines:
					try:
						#urls.append(re.search("(?P<url>https?://[^\s]+)", word).group("url"))
						for url in GRUBER_URLINTEXT_PAT.findall(line):
							if url[0]:
								urls.append(url[0])
					except:
						pass
				fp = open('/var/tmp/urls.log', 'a')
				for url in urls:
					fp.write("%s\n" % url)
				fp.close()

			# Process only interesting files
			# if contenttype not in ('text/plain', 'text/html', 'image/jpeg', 'image/gif', 'image/png'):
			if contenttype not in excludetypes or fextension == '.js':
				if not filename:
					filename = md5
				mime_ext = mimetypes.guess_extension(contenttype)
				if not mime_ext:
					# Use a generic bag-of-bits extension
					mime_ext = '.bin'
				f_name, f_ext = os.path.splitext(filename)
				if not f_ext:
					filename += mime_ext

				writeLog('Found interesting file: %s (%s)' % (filename, contenttype))

				fp = open(os.path.join(generateDumpDirectory(args.directory), filename), 'wb')
				fp.write(data)
				fp.close()

				if contenttype in ['application/zip', 'application/x-zip-compressed']:
					# Process ZIP archive
					writeLog('Processing zip archive: %s' % filename)
					processZipFile(os.path.join(generateDumpDirectory(args.directory), filename))
				else:
					# Check VT score
					vt = VirusTotalPublicApi(config['apiKey'])
					response = vt.get_file_report(md5)

					# Save results to Elasticsearch
					if config['esServer']:
						try:
							response['@timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S+01:00")
							response['filename'] = filename
							if usePyzMail:
								response['mail'] = mailheaders
							res = es.index(index=config['esIndex'], doc_type="VTresult", body=json.dumps(response))
						except:
							writeLog("Cannot index to Elasticsearch")

					# DEBUG
					fp = open('/tmp/vt.debug', 'a')
					fp.write(json.dumps(response, sort_keys=False, indent=4))
					fp.close()

					vtScore = "0/0"
					if response['response_code'] == 200:
						if response['results']['response_code']:
							positives = response['results']['positives']
							total = response['results']['total']
							scan_date = response['results']['scan_date']
							vtScore = str(positives) + "/" + str(total)
							if positives > 0:
								rcode = 1

							writeLog('File: %s (%s) Score: %s Scanned: %s (%s)' %
								(filename, md5, vtScore, scan_date, timeDiff(scan_date)))
						else:
							# Do not resubmit existing MD5
							if !dbMD5Exists(md5):
								writeLog('File: %s (%s) not found, submited for scanning' %
									(filename, md5))
								submit2vt(os.path.join(generateDumpDirectory(args.directory), filename))
						dbAddMD5(md5, filename, vtScore)
					else:
						writeLog('VT Error: %s' % response['error'])

					# Analyze OLE documents if API is available
					parseOLEDocument(os.path.join(generateDumpDirectory(args.directory), filename))
Exemplo n.º 54
0
def makesane(row):
    """Copy one record's source file into the archive tree and return the
    rewritten metadata row.

    ``row`` is an indexable record of metadata fields. The indices used
    below appear to be: row[3] source URL, row[4] unknown passthrough,
    row[6] source/collection id, row[9] unique record id, row[10] title,
    row[20] file extension -- presumably; TODO confirm against the
    caller's input format.

    Returns the sanitised row (a list) on success, or the string "Error"
    if the file copy failed.

    Side effects: creates destination directories under
    /NFSMount/sardar/files and copies the source file there; mutates
    row[20] in place with the detected extension.
    """

    # replace % characters in URL eg %20 by space
    urlpath = urllib2.unquote(row[3])
    urlprefix = "http://10.129.50.5/nvli/data/"

    # Strip urlprefix to find the relative path in local partition
    srcfile = re.sub(urlprefix, '', urlpath)

    # the local partition
    srcdir = "/NFSMount/SV-Patel_Data/nvli"
    srcpath = '/'.join([srcdir, srcfile])

    # Normalise the title: trim whitespace and any trailing full stop.
    title = row[10].strip()
    title = title.rstrip('.')
    title = title.strip()

    # mimetypes library does not seem to use magic, so this does not work
    #fmt = mimetypes.guess_extension(mimetypes.guess_type(srcpath)[0])

    ## https://github.com/ahupp/python-magic
    ## pip install python-magic

    try:
        # Detect the real content type from file contents, then map the
        # MIME type to a conventional extension.
        ext = mimetypes.guess_extension(magic.from_file(srcpath, mime=True))
    except EnvironmentError:
        # parent of IOError, OSError *and* WindowsError where available
        # print 'Error, file not found: %s' % srcpath
        # Fall back to the extension recorded in the metadata row.
        ext = "." + row[20].lower()
    except:
        # NOTE(review): this bare except assumes the only other failure mode
        # is a missing python-magic install; it also swallows KeyboardInterrupt.
        print "For from_file missing error: pip install python-magic"
        sys.exit(0)

    # some standard mappings where the default provided by python is
    # not conventional (eg: .jpe for .jpeg or .jpg
    ext = extnmap(ext)

    # rename the original extension to that of magic
    row[20] = ext.lstrip('.')

    # made from uniq ID, title and extension
    filename = sanefilename([title])  # remove common words
    # get the first n words of title
    filename = '-'.join(filename.split('-')[:6])

    # prefix and suffix
    filename = sanefilename([row[9], filename]) + ext
    #filename = sanefilename([row[9], title]) + ext
    # because sanefilename strips . from filename

    # source
    sourceid = row[6]
    dirname = sanefilename([sourceid])
    # create a subdir for each hyphenated part of uniq ID
    dirname = re.sub(r"-", '/', dirname)

    relpath = 'archive/' + dirname + '/' + filename

    destroot = "/NFSMount/sardar/files"
    destpath = '/'.join([destroot, relpath])

    # Ensure the destination directory hierarchy exists before copying.
    dirname = os.path.dirname(destpath)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    #os.rename is a mv and needs permissions to delete srcpath
    print 'Copying from %s to \n\t%s ' % (srcpath, destpath)
    sys.stdout.flush()

    try:
        shutil.copyfile(srcpath, destpath)
        sane = [row[9], relpath, row[4]] + row[10:]
    # eg. src and dest are the same file
    except shutil.Error as e:
        print('Error: %s' % e)
        sane = "Error"
    # eg. source or destination doesn't exist
    except IOError as e:
        print('Error: %s, %s' % (srcpath, e.strerror))
        sane = "Error"

    #sane = [filename]
    return sane
Exemplo n.º 55
0
def _download_http_url(
        link,  # type: Link
        session,  # type: PipSession
        temp_dir,  # type: str
        hashes,  # type: Hashes
        progress_bar  # type: str
):
    # type: (...) -> Tuple[str, str]
    """Download link url into temp_dir using provided session.

    Returns a ``(file_path, content_type)`` tuple. The filename is taken
    from the Content-Disposition header when present, falling back to the
    link's own filename; a missing extension is filled in from the
    Content-Type header or, failing that, from the final (post-redirect)
    response URL.

    Raises requests.HTTPError (after logging) on a non-2xx response.
    """
    # Strip any URL fragment; it is not part of the request target.
    target_url = link.url.split('#', 1)[0]
    try:
        resp = session.get(
            target_url,
            # We use Accept-Encoding: identity here because requests
            # defaults to accepting compressed responses. This breaks in
            # a variety of ways depending on how the server is configured.
            # - Some servers will notice that the file isn't a compressible
            #   file and will leave the file alone and with an empty
            #   Content-Encoding
            # - Some servers will notice that the file is already
            #   compressed and will leave the file alone and will add a
            #   Content-Encoding: gzip header
            # - Some servers won't notice anything at all and will take
            #   a file that's already been compressed and compress it again
            #   and set the Content-Encoding: gzip header
            # By setting this to request only the identity encoding We're
            # hoping to eliminate the third case. Hopefully there does not
            # exist a server which when given a file will notice it is
            # already compressed and that you're not asking for a
            # compressed file and will then decompress it before sending
            # because if that's the case I don't think it'll ever be
            # possible to make this work.
            headers={"Accept-Encoding": "identity"},
            stream=True,
        )
        resp.raise_for_status()
    except requests.HTTPError as exc:
        logger.critical(
            "HTTP error %s while getting %s",
            exc.response.status_code,
            link,
        )
        raise

    content_type = resp.headers.get('content-type', '')
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get('content-disposition')
    if content_disposition:
        # FIX: the header's disposition type was previously bound to a local
        # named `type`, shadowing the builtin; it is unused, so discard it.
        _, params = cgi.parse_header(content_disposition)
        # We use ``or`` here because we don't want to use an "empty" value
        # from the filename param.
        filename = params.get('filename') or filename
    ext = splitext(filename)[1]
    if not ext:
        # guess_extension() returns None for unknown types; guarded below.
        ext = mimetypes.guess_extension(content_type)
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        # Last resort: borrow the extension from the redirected URL.
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    file_path = os.path.join(temp_dir, filename)
    with open(file_path, 'wb') as content_file:
        _download_url(resp, link, content_file, hashes, progress_bar)
    return file_path, content_type
Exemplo n.º 56
0
    def output(self, task, entry, config):
        """Moves temp-file into final destination.

        The destination directory comes from the entry's `path` field, the
        plugin config, or the --dl-path command line option (which overrides
        both). The filename comes from config, or falls back to the entry
        title, optionally suffixed with an extension guessed from the
        entry's mime-type.

        Raises:
            PluginError if operation fails
        """

        if 'file' not in entry and not task.options.test:
            log.debug('file missing, entry: %s', entry)
            raise plugin.PluginError(
                'Entry `%s` has no temp file associated with' % entry['title'])

        try:
            # use path from entry if has one, otherwise use from download definition parameter
            path = entry.get('path', config.get('path'))
            if not isinstance(path, str):
                raise plugin.PluginError('Invalid `path` in entry `%s`' %
                                         entry['title'])

            # override path from command line parameter
            if task.options.dl_path:
                path = task.options.dl_path

            # expand variables in path
            try:
                path = os.path.expanduser(entry.render(path))
            except RenderError as e:
                entry.fail(
                    'Could not set path. Error during string replacement: %s' %
                    e)
                return

            # Clean illegal characters from path name
            path = pathscrub(path)

            # If we are in test mode, report and return
            if task.options.test:
                log.info('Would write `%s` to `%s`', entry['title'], path)
                # Set a fake location, so the exec plugin can do string replacement during --test #1015
                entry['location'] = os.path.join(path, 'TEST_MODE_NO_OUTPUT')
                return

            # make path
            if not os.path.isdir(path):
                log.debug('Creating directory %s', path)
                try:
                    os.makedirs(path)
                except OSError:
                    # FIX: narrowed from a bare `except`; os.makedirs only
                    # raises OSError (and subclasses) on failure.
                    raise plugin.PluginError('Cannot create path %s' % path,
                                             log)

            # check that temp file is present
            if not os.path.exists(entry['file']):
                log.debug('entry: %s', entry)
                raise plugin.PluginWarning(
                    'Downloaded temp file `%s` doesn\'t exist!?' %
                    entry['file'])

            if config.get('filename'):
                try:
                    entry['filename'] = entry.render(config['filename'])
                    log.debug('set filename from config %s' %
                              entry['filename'])
                except RenderError as e:
                    entry.fail(
                        'Could not set filename. Error during string replacement: %s'
                        % e)
                    return
            # if we still don't have a filename, try making one from title (last resort)
            elif not entry.get('filename'):
                entry['filename'] = entry['title']
                log.debug('set filename from title %s', entry['filename'])
                if 'mime-type' not in entry:
                    log.warning(
                        'Unable to figure proper filename for %s. Using title.',
                        entry['title'])
                else:
                    guess = mimetypes.guess_extension(entry['mime-type'])
                    if not guess:
                        # FIX: previously logged `guess` (always falsy in this
                        # branch) instead of the mime-type that failed.
                        log.warning(
                            'Unable to guess extension with mime-type %s',
                            entry['mime-type'])
                    else:
                        self.filename_ext_from_mime(entry)

            name = entry.get('filename', entry['title'])
            # Remove illegal characters from filename #325, #353
            name = pathscrub(name)
            # Remove directory separators from filename #208
            name = name.replace('/', ' ')
            if sys.platform.startswith('win'):
                name = name.replace('\\', ' ')
            # remove duplicate spaces
            name = ' '.join(name.split())
            # combine to full path + filename
            destfile = os.path.join(path, name)
            log.debug('destfile: %s', destfile)

            if os.path.exists(destfile):
                import filecmp
                if filecmp.cmp(entry['file'], destfile):
                    log.debug("Identical destination file '%s' already exists",
                              destfile)
                elif config.get('overwrite'):
                    log.debug("Overwriting already existing file %s", destfile)
                else:
                    log.info(
                        'File `%s` already exists and is not identical, download failed.',
                        destfile)
                    entry.fail(
                        'File `%s` already exists and is not identical.' %
                        destfile)
                    return
            else:
                # move temp file
                log.debug('moving %s to %s', entry['file'], destfile)

                try:
                    shutil.move(entry['file'], destfile)
                except (IOError, OSError) as err:
                    # ignore permission errors, see ticket #555
                    import errno
                    if not os.path.exists(destfile):
                        raise plugin.PluginError('Unable to write %s: %s' %
                                                 (destfile, err))
                    if err.errno != errno.EPERM and err.errno != errno.EACCES:
                        raise
                else:
                    # move succeeded: the temp file reference is now stale
                    del entry['file']

            # store final destination as output key
            entry['location'] = destfile

        finally:
            # temp file is cleaned up whether or not the move succeeded
            self.cleanup_temp_file(entry)
Exemplo n.º 57
0
def guess_extension(mimetype):
    """Return a file extension for *mimetype* (or None if unknown),
    normalising the stdlib's uncommon ``.jpe`` answer to ``.jpeg``."""
    ext = mimetypes.guess_extension(mimetype)
    return '.jpeg' if ext == '.jpe' else ext
Exemplo n.º 58
0
def process_mailbox(M):
    """Index every message of the selected IMAP mailbox into MongoDB.

    ``M`` is an ``imaplib``-style connection with a mailbox already
    selected. For each message: attachments are written to
    ``Attachment_DIRECTORY`` (with a uuid1 prefix), sender/recipient
    headers are split into name and address parts, and a metadata record
    is inserted into the local ``hash.ib`` collection. Messages whose raw
    Date header is already present in the collection are skipped.
    Per-message failures are printed and the loop continues.
    """
    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print("No messages found!")
        return

    # counter: messages indexed so far; also used in fallback attachment names.
    counter = 0
    for num in data[0].split():
        try:
            # time.sleep(1)
            rv, data = M.fetch(num, '(RFC822)')
            if rv != 'OK':
                print("ERROR getting message", num)
                return

            msg = email.message_from_bytes(data[0][1])
            # Decode a possibly RFC 2047-encoded Subject into readable text.
            hdr = email.header.make_header(
                email.header.decode_header(msg['Subject']))
            subject = str(hdr)
            date_tuple = email.utils.parsedate_tz(msg['Date'])
            local_date = ''
            if date_tuple:
                # NOTE(review): local_date is computed but never used below.
                local_date = datetime.datetime.fromtimestamp(
                    email.utils.mktime_tz(date_tuple))
            conn = ''
            try:
                # NOTE(review): a new MongoDB connection is opened per message;
                # hoisting it out of the loop would be cheaper.
                conn = MongoClient('localhost', 27017)
                print("Connected successfully!!!")
            except:
                print("Could not connect to MongoDB")

            # database
            db = conn.hash
            # Created or Switched to collection names: testGmailAnup
            collection = db.ib

            # Skip messages already indexed, keyed on the raw Date header.
            # NOTE(review): Cursor.count() is deprecated in pymongo 3.x
            # (count_documents() is the modern replacement).
            if db.ib.find({'email_timestamp': str(msg['Date'])}).count() > 0:
                continue

            # Attachment extraction: save every part to disk and collect a
            # comma-separated list of the saved paths.
            attachmenturl = ''
            emailbody = ''
            for part in msg.walk():
                try:
                    # multipart/* are just containers
                    if part.get_content_maintype() == 'multipart':
                        # NOTE(review): this inner walk rebinds `part`, so the
                        # code below operates on the last walked part rather
                        # than the container -- confirm this is intended.
                        for part in msg.walk():
                            if part.get_content_type() == 'text/plain':
                                emailbody = part.get_payload(
                                )  # prints the raw text

                    filename = part.get_filename()
                    if not filename:
                        # No filename in the part: synthesise one from the
                        # MIME type's conventional extension.
                        ext = mimetypes.guess_extension(
                            part.get_content_type())
                        if not ext:
                            ext = '.bin'
                        filename = 'part-%03d%s' % (counter, ext)

                    # uuid1 prefix avoids collisions between messages.
                    filename = str(uuid.uuid1()) + filename
                    f = open('%s/%s' % (Attachment_DIRECTORY, filename), 'wb')
                    f.write(part.get_payload(decode=True))
                    f.close()

                    if attachmenturl == '':
                        attachmenturl = Attachment_DIRECTORY + "/" + filename
                    else:
                        attachmenturl = attachmenturl + "," + Attachment_DIRECTORY + "/" + filename
                except Exception:
                    continue  # or you could use 'continue'

            timestamp = int(time.time())  # timestamp

            # Split "Name <addr>" From header into display name and address.
            email_sender = ''
            email_sender_id = ''
            try:
                if '<' in msg['From']:
                    email_sender = msg['From'].split('<')[0]
                    email_sender_id = msg['From'].split('<')[1].replace(
                        ">", " ")
                else:
                    email_sender_id = msg['From']
            except:
                print("")

            # Same split for the To header.
            email_recipeint = ''
            email_recipient_id = ''
            try:
                if '<' in msg['To']:
                    email_recipeint = msg['To'].split('<')[0]
                    email_recipient_id = msg['To'].split('<')[1].replace(
                        ">", " ")
                else:
                    email_recipient_id = msg['To']
            except:
                print("")

            # Same split for the Cc header (may be absent -> except path).
            email_recipeint_CC = ''
            email_recipient_CC_id = ''
            try:
                if '<' in msg['Cc']:
                    email_recipeint_CC = msg['Cc'].split('<')[0]
                    email_recipient_CC_id = msg['Cc'].split('<')[1].replace(
                        ">", " ")
                else:
                    email_recipient_CC_id = msg['Cc']
            except:
                print("")

            # Same split for the Bcc header (may be absent -> except path).
            email_recipeint_CCO = ''
            email_recipient_CCO_ID = ''
            try:
                if '<' in msg['Bcc']:
                    email_recipeint_CCO = msg['Bcc'].split('<')[0]
                    email_recipient_CCO_ID = msg['Bcc'].split('<')[1].replace(
                        ">", " ")
                else:
                    email_recipient_CCO_ID = msg['Bcc']
            except:
                print("")

            # Metadata record inserted into MongoDB. The hard-coded UUIDs
            # presumably identify this mailbox/group in a larger system --
            # verify before reusing elsewhere.
            emp_rec1 = {
                "tphashobject_metadata_tib":
                "8f7074d8-a520-4f7d-b2d3-09dc36acb5fd",
                "tphashobject_metadata_tib_name": "TPEMAIL",
                "tpemail_metadata_mail_box_name": "Frederico Gmail",
                "tpemail_metadata_id_mail_box": str(uuid.uuid1()),
                "tpemail_metadata_time": timestamp,
                "tpemail_metadata_time_zone": strftime("%z", gmtime()),
                "tpemail_metadata_email_subject": subject,
                "tpemail_metadata_email_sender": email_sender,
                "tpemail_metadata_email_sender_id": email_sender_id,
                "tpemail_metadata_email_recipeint": email_recipeint,
                "tpemail_metadata_email_recipient_id": email_recipient_id,
                "tpemail_metadata_email_timestamp": msg['Date'],
                "tpemail_metadata_email_header": "",
                "tpemail_metadata_email_body": emailbody,
                "tpemail_metadata_email_seq": "",
                "tpemail_metadata_email_text_content": "",
                "tpemail_metadata_email_html_content": "",
                "tpemail_metadata_email_eml_content": "",
                "tpemail_metadata_email_links": "",
                "tpemail_metadata_email_atach": attachmenturl,
                "tpemail_metadata_email_template_id": "",
                "tpemail_metadata_email_track_link": "",
                "tpemail_metadata_email_recipeint_cc": email_recipeint_CC,
                "tpemail_metadata_email_recipient_cc_id":
                email_recipient_CC_id,
                "tpemail_metadata_email_recipeint_cco": email_recipeint_CCO,
                "tpemail_metadata_email_recipient_cco_id":
                email_recipient_CCO_ID,
                "tphashobject_metadata_hash_owner_id":
                "g00zNU6n7WfhUI1u4A5ebxSN0732",
                "tpemail_metadata_hash_sender_id": "",
                "tpemail_metadata_hash_recipt_id": "",
                "tpemail_metadata_hash_sender_name": "",
                "tpemail_metadata_hash_receipt_name": "",
                "tpemail_metadata_hash_recipt_cc_id": "",
                "tpemail_metadata_hash_recipt_cco_id": "",
                "tphashobject_metadata_hub_group_id":
                "da0a7b22-fb15-46e0-9f5a-019263d79e36",
                "tphashobject_metadata_data_sinc_mongodb": "",
                "tphashobject_metadata_action": "",
                "tphashobject_metadata_role": "",
                "tphashobject_metadata_layout_role": "",
                "tphashobject_metadata_group_id":
                "9529b03b-38e8-4bdc-aa62-8055a4c36a55",
                "tpemailbox_metadata_IP_machine": host_ip
            }
            # Insert Data
            rec_id1 = collection.insert_one(emp_rec1)
            print("Data inserted with record ids", rec_id1)

            counter += 1
            print(str(counter), "]")
            print('Subject :', subject)
            print('Raw Date:', msg['Date'])
            print('From :', msg['From'].split('<')[0])
            print("")
        except Exception as e:
            print('Main Fail: ' + str(e))
            continue  # or you could use 'continue'
Exemplo n.º 59
0
     if e.errno != errno.EEXIST:
         raise
 counter = 1
 numAttachments = 0
 start = time.time()
 for part in msg.walk():
     # multipart/* are just containers
     if part.get_content_maintype() == 'multipart':
         continue
     # Applications should really sanitize the given filename so that an
     # email message can't be used to overwrite important files
     filename = part.get_filename()
     LOG(("\tFound attachment: %s" % filename), False, True)
     if not filename:
         orig = ''
         ext = mimetypes.guess_extension(part.get_content_type())
         if ((ext == '.exe') or (ext == '.pdf')):
             orig = ext
             ext = '.txt'
         if not ext:
             # Use a generic bag-of-bits extension
             ext = '.bin'
         filename = 'part-%03d%s%s' % (counter, orig, ext)
     counter += 1
     # write the file to targetFolder
     outputFilename = ("%s-%s" % (GUID, filename))
     LOG(("\tWriting as: %s" % outputFilename), False, True)
     fp = open(os.path.join(targetFolder, outputFilename), 'wb')
     fp.write(part.get_payload(decode=True))
     numAttachments += 1
     fp.close()
Exemplo n.º 60
0
    def download_past_media(self, dumper, target_id):
        """
        Downloads the past media that has already been dumped into the
        database but has not been downloaded for the given target ID yet.

        Media which formatted filename results in an already-existing file
        will be *ignored* and not re-downloaded again.
        """
        # TODO Should this respect and download only allowed media? Or all?
        target_in = self.client.get_input_entity(target_id)
        target = self.client.get_entity(target_in)
        # Normalise to the canonical peer ID used by the database.
        target_id = utils.get_peer_id(target)

        msg_cursor = dumper.conn.cursor()
        msg_cursor.execute(
            'SELECT ID, Date, FromID, MediaID FROM Message '
            'WHERE ContextID = ? AND MediaID IS NOT NULL', (target_id, ))

        msg_row = msg_cursor.fetchone()
        while msg_row:
            media_row = dumper.conn.execute(
                'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name '
                'FROM Media WHERE ID = ?', (msg_row[3], )).fetchone()
            # Documents have attributes and they're saved under the "document"
            # namespace so we need to split it before actually comparing.
            media_type = media_row[3].split('.')
            media_type, media_subtype = media_type[0], media_type[-1]
            if media_type not in ('photo', 'document'):
                # Only photos or documents are actually downloadable
                msg_row = msg_cursor.fetchone()
                continue

            user_row = dumper.conn.execute(
                'SELECT FirstName, LastName FROM User WHERE ID = ?',
                (msg_row[2], )).fetchone()
            if user_row:
                # FIX: previously formatted msg_row[0]/msg_row[1] (the message
                # ID and date) instead of the user's first/last name columns.
                sender_name = '{} {}'.format(user_row[0] or '', user_row[1]
                                             or '').strip()
            else:
                sender_name = ''

            date = datetime.datetime.utcfromtimestamp(msg_row[1])
            # defaultdict(str) so unknown placeholders in media_fmt render as ''.
            formatter = defaultdict(str,
                                    id=msg_row[0],
                                    context_id=target_id,
                                    sender_id=msg_row[2] or 0,
                                    type=media_subtype or 'unknown',
                                    ext=mimetypes.guess_extension(media_row[4])
                                    or '.bin',
                                    name=utils.get_display_name(target)
                                    or 'unknown',
                                    sender_name=sender_name or 'unknown')
            if formatter['ext'] == '.jpe':
                formatter['ext'] = '.jpg'  # Nobody uses .jpe for photos

            # Photos have no stored name; synthesise one from the date.
            name = None if media_subtype == 'photo' else media_row[5]
            formatter['filename'] = name or date.strftime(
                '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type']))
            filename = date.strftime(self.media_fmt).format_map(formatter)
            if not filename.endswith(formatter['ext']):
                if filename.endswith('.'):
                    filename = filename[:-1]
                filename += formatter['ext']

            if os.path.isfile(filename):
                __log__.debug('Skipping existing file %s', filename)
            else:
                __log__.info('Downloading to %s', filename)
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                if media_type == 'document':
                    self.client.download_file(types.InputDocumentFileLocation(
                        id=media_row[0],
                        version=media_row[1],
                        access_hash=media_row[2]),
                                              file=filename)
                else:
                    self.client.download_file(types.InputFileLocation(
                        local_id=media_row[0],
                        volume_id=media_row[1],
                        secret=media_row[2]),
                                              file=filename)
                # Be gentle with Telegram's servers between downloads.
                time.sleep(1)
            msg_row = msg_cursor.fetchone()