def get_file_location(media):
    """
    Turn arbitrary media into an ``(input location, size)`` pair.

    Handled inputs are ``MessageMediaPhoto`` (unwrapped to its ``photo``),
    ``Photo``, ``MessageMediaDocument``, ``UserProfilePhoto`` and
    ``ChatPhoto``. Anything else yields ``(None, None)``.

    :param media: the media object to inspect.
    :return: a tuple ``(location, file_size)``. ``location`` is an
        ``InputFileLocation`` for photos, an ``InputDocumentFileLocation``
        for documents, or ``None`` when nothing usable was found.
        ``file_size`` is only filled in for photos and documents; it stays
        ``None`` for profile/chat photos.
    """
    if isinstance(media, types.MessageMediaPhoto):
        # Unwrap so that wrapped and bare photos share the branch below.
        media = media.photo

    found = None
    byte_count = None

    if isinstance(media, types.Photo):
        # Sizes are visited last-to-first; the first usable one wins
        # (presumably the last entry is the largest -- confirm).
        for candidate in reversed(media.sizes):
            if not isinstance(candidate, types.PhotoSize):
                continue
            if not isinstance(candidate.location, types.FileLocation):
                continue
            byte_count = candidate.size
            found = candidate.location
            break
    elif isinstance(media, types.MessageMediaDocument):
        if isinstance(media.document, types.Document):
            byte_count = media.document.size
            found = types.InputDocumentFileLocation(
                id=media.document.id,
                access_hash=media.document.access_hash,
                version=media.document.version
            )
    elif isinstance(media, (types.UserProfilePhoto, types.ChatPhoto)):
        # Prefer the big variant, fall back to the small one.
        if isinstance(media.photo_big, types.FileLocation):
            found = media.photo_big
        elif isinstance(media.photo_small, types.FileLocation):
            found = media.photo_small

    if not isinstance(found, types.FileLocation):
        # Either nothing was found or it already is an input location.
        return found, byte_count

    input_location = types.InputFileLocation(
        volume_id=found.volume_id,
        local_id=found.local_id,
        secret=found.secret
    )
    return input_location, byte_count
def download_profile_photo(self, photo, target, known_id=None):
    """
    Similar to Downloader.download_media() but for profile photos.

    Has no effect if there is no photo format (thus it is "disabled").

    :param photo: a ``UserProfilePhoto``, ``ChatPhoto`` or ``Photo``. Any
        other type, or one without a usable ``FileLocation``, is ignored.
    :param target: the entity owning the photo; used for the peer ID and
        the display name placed in the filename formatter.
    :param known_id: value for the ``id`` formatter key. When ``None`` it
        falls back to ``photo.id`` (for ``Photo``) and then to the peer ID
        of ``target``.
    :return: whatever ``self.client.download_file`` returns, or ``None``
        when nothing was downloaded.
    """
    if not self.photo_fmt:
        return

    date = datetime.datetime.now()
    if isinstance(photo, (types.UserProfilePhoto, types.ChatPhoto)):
        if isinstance(photo.photo_big, types.FileLocation):
            location = photo.photo_big
        elif isinstance(photo.photo_small, types.FileLocation):
            location = photo.photo_small
        else:
            return
    elif isinstance(photo, types.Photo):
        # First size (in stored order) that has a downloadable location.
        # NOTE(review): get_file_location() walks sizes in reverse order;
        # confirm whether the forward order here is intentional.
        location = next(
            (size.location for size in photo.sizes
             if isinstance(size, types.PhotoSize)
             and isinstance(size.location, types.FileLocation)),
            None
        )
        if location is None:
            return
        date = photo.date
        if known_id is None:
            known_id = photo.id
    else:
        return

    peer_id = utils.get_peer_id(target)
    display_name = utils.get_display_name(target) or 'unknown'
    if known_id is None:
        known_id = peer_id

    formatter = defaultdict(
        str,
        id=known_id,
        context_id=peer_id,
        sender_id=peer_id,
        ext='.jpg',
        type='chatphoto',
        filename=date.strftime('chatphoto_%Y-%m-%d_%H-%M-%S'),
        name=display_name,
        sender_name=display_name
    )

    # Date directives are expanded first, then the {placeholders}.
    filename = date.strftime(self.photo_fmt).format_map(formatter)
    if not filename.endswith(formatter['ext']):
        if filename.endswith('.'):
            filename = filename[:-1]
        filename += formatter['ext']

    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the formatted path actually has one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    return self.client.download_file(
        types.InputFileLocation(
            volume_id=location.volume_id,
            local_id=location.local_id,
            secret=location.secret
        ),
        file=filename,
        part_size_kb=256
    )
async def _download_media(self, media_id, context_id, sender_id, date, bar):
    """
    Download one media item previously saved in the ``Media`` table.

    The target path is built from ``self.media_fmt`` (date directives are
    expanded first, then ``{placeholders}``) with ``.<media_id><ext>``
    appended. Files that already exist are skipped.

    :param media_id: value of ``Media.ID`` to download.
    :param context_id: ID placed in the formatter and used to look up the
        ``name`` placeholder through ``self._get_name``.
    :param sender_id: ID placed in the formatter and used to look up the
        ``sender_name`` placeholder through ``self._get_name``.
    :param date: object providing ``strftime``; used for the path and for
        the default filename.
    :param bar: progress bar whose ``total`` is increased and on which
        ``update()`` is called as parts arrive.
    """
    media_row = self.dumper.conn.execute(
        'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size '
        'FROM Media WHERE ID = ?', (media_id,)
    ).fetchone()
    if media_row is None:
        # Indexing a missing row would raise TypeError; skip it instead.
        __log__.warning('No media row found for ID %s, skipping', media_id)
        return

    # Documents have attributes and they're saved under the "document"
    # namespace so we need to split it before actually comparing.
    type_parts = media_row[3].split('.')
    media_type, media_subtype = type_parts[0], type_parts[-1]
    if media_type not in ('photo', 'document'):
        return  # Only photos or documents are actually downloadable

    formatter = defaultdict(
        str,
        context_id=context_id,
        sender_id=sender_id,
        type=media_subtype or 'unknown',
        name=self._get_name(context_id) or 'unknown',
        sender_name=self._get_name(sender_id) or 'unknown'
    )

    # Documents might have a filename, which may have an extension. Use
    # the extension from the filename if any (more accurate than mime).
    ext = None
    filename = media_row[5]
    if filename:
        filename, ext = os.path.splitext(filename)
    else:
        # No filename at all, set a sensible default filename
        filename = date.strftime(
            '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type'])
        )

    # Either there was no filename or it had no extension; fall back to
    # one derived from the stored mimetype.
    if not ext:
        ext = export_utils.get_extension(media_row[4])

    # Apply the date to the user format string and then replace the map
    formatter['filename'] = filename
    filename = date.strftime(self.media_fmt).format_map(formatter)
    filename += '.{}{}'.format(media_id, ext)
    if os.path.isfile(filename):
        __log__.debug('Skipping already-existing file %s', filename)
        return

    __log__.debug('Downloading to %s', filename)
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the formatted path actually has one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if media_type == 'document':
        # For documents the LocalID/VolumeID/Secret columns are read back
        # as id/version/access_hash respectively.
        location = types.InputDocumentFileLocation(
            id=media_row[0],
            version=media_row[1],
            access_hash=media_row[2]
        )
    else:
        location = types.InputFileLocation(
            local_id=media_row[0],
            volume_id=media_row[1],
            secret=media_row[2]
        )

    def progress(saved, total):
        """Increment the tqdm progress bar"""
        if total is None:
            # No size was found so the bar total wasn't incremented before
            bar.total += saved
            bar.update(saved)
        elif saved == total:
            # Last part: it may be shorter than a full part, so advance by
            # the remainder (or a full part when it divides evenly).
            mod = (saved % DOWNLOAD_PART_SIZE) or DOWNLOAD_PART_SIZE
            bar.update(mod)
        else:
            # All chunks are of the same size and this isn't the last one
            bar.update(DOWNLOAD_PART_SIZE)

    if media_row[6] is not None:
        bar.total += media_row[6]

    # Deliberately not reset in a ``finally``: if the download raises, the
    # attribute keeps pointing at the partial file (presumably so cleanup
    # code elsewhere can remove it -- confirm against the caller).
    self._incomplete_download = filename
    await self.client.download_file(
        location,
        file=filename,
        file_size=media_row[6],
        part_size_kb=DOWNLOAD_PART_SIZE // 1024,
        progress_callback=progress
    )
    self._incomplete_download = None
async def _download_media(self, media_id, context_id, sender_id, date,
                          progress):
    """
    Download one media item described by a ``db_media`` document.

    Media is skipped (nothing is downloaded) when: the document is missing,
    its size is above ``self.max_size`` or below ``self.min_size``, its
    type is neither ``photo`` nor ``document``, it has no name, or the
    target file already exists.

    The target path is built from ``self.media_fmt`` (date directives are
    expanded first, then ``{placeholders}``) with ``.<media_id><ext>``
    appended.

    :param media_id: ``_id`` of the media document to download.
    :param context_id: ID placed in the formatter and used to look up the
        ``name`` placeholder through ``self._get_name``.
    :param sender_id: ID placed in the formatter and used to look up the
        ``sender_name`` placeholder through ``self._get_name``.
    :param date: object providing ``strftime``; used to build the path.
    :param progress: progress tracker; its ``name`` and ``total``
        attributes are written and ``inc()`` is called as parts arrive.
    """
    media_row = await db_media.find_one({'_id': media_id})
    if media_row is None:
        # Subscripting a missing document would raise TypeError.
        logger.warning('No media document found for ID %s, skipping',
                       media_id)
        return

    progress.name = media_row['name']
    size = media_row['size']
    if size:
        if size > self.max_size:
            logger.warning('忽略过大文件:%s', media_row['name'])
            return  # Ignore files that are too large
        if size < self.min_size:
            logger.warning('忽略过小文件:%s', media_row['name'])
            return  # Ignore files that are too small

    # Documents have attributes and they're saved under the "document"
    # namespace so we need to split it before actually comparing.
    type_parts = media_row['type'].split('.')
    media_type, media_subtype = type_parts[0], type_parts[-1]
    if media_type not in ('document', 'photo'):
        logger.info('忽略文档类型:%s', media_type)
        return  # Only photos or documents are actually downloadable

    formatter = defaultdict(
        str,
        context_id=context_id,
        sender_id=sender_id,
        type=media_subtype or 'unknown',
        name=await self._get_name(context_id) or 'unknown',
        sender_name=await self._get_name(sender_id) or 'unknown'
    )

    # Media without a stored name is skipped entirely rather than being
    # given a generated filename.
    stored_name = media_row['name']
    if not stored_name:
        logger.debug('忽略无名称文件')
        return

    # Documents might have a filename, which may have an extension. Use
    # the extension from the filename if any (more accurate than mime).
    filename, ext = os.path.splitext(stored_name)
    if not ext:
        # The name had no extension; derive one from the known mimetype.
        ext = export_utils.get_extension(media_row['mime_type'])

    # Apply the date to the user format string and then replace the map
    formatter['filename'] = fix_windows_filename(filename)
    filename = date.strftime(self.media_fmt).format_map(formatter)
    filename += '.{}{}'.format(media_id, ext)
    if os.path.isfile(filename):
        logger.debug('Skipping already-existing file %s', filename)
        return

    logger.info('正在下载:%s 至 %s', media_type, filename)
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the formatted path actually has one.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if media_type == 'document':
        # For documents the local_id/volume_id/secret fields are read back
        # as id/version/access_hash respectively.
        location = types.InputDocumentFileLocation(
            id=media_row['local_id'],
            version=media_row['volume_id'],
            access_hash=media_row['secret']
        )
    else:
        location = types.InputFileLocation(
            local_id=media_row['local_id'],
            volume_id=media_row['volume_id'],
            secret=media_row['secret']
        )

    def progress_callback(saved, total):
        """Advance the progress tracker as parts are saved"""
        if total is None:
            # No size was found so the total wasn't incremented before
            progress.total += saved
            progress.inc(saved)
        elif saved == total:
            # Last part: it may be shorter than a full part, so advance by
            # the remainder (or a full part when it divides evenly).
            mod = (saved % DOWNLOAD_PART_SIZE) or DOWNLOAD_PART_SIZE
            progress.inc(mod)
        else:
            # All chunks are of the same size and this isn't the last one
            progress.inc(DOWNLOAD_PART_SIZE)

    if size is not None:
        progress.total += size

    # Deliberately not reset in a ``finally``: if the download raises, the
    # attribute keeps pointing at the partial file (presumably so cleanup
    # code elsewhere can remove it -- confirm against the caller).
    self._incomplete_download = filename
    await self.client.download_file(
        location,
        file=filename,
        file_size=size,
        part_size_kb=DOWNLOAD_PART_SIZE // 1024,
        progress_callback=progress_callback
    )
    self._incomplete_download = None
def download_past_media(self, dumper, target_id):
    """
    Downloads the past media that has already been dumped into the
    database but has not been downloaded for the given target ID yet.

    Media which formatted filename results in an already-existing file
    will be *ignored* and not re-downloaded again.

    :param dumper: object whose ``conn`` attribute is the database
        connection holding the ``Message``, ``Media`` and ``User`` tables.
    :param target_id: anything ``self.client.get_input_entity`` accepts;
        it is resolved to the peer ID used as ``Message.ContextID``.
    """
    # TODO Should this respect and download only allowed media? Or all?
    target_in = self.client.get_input_entity(target_id)
    target = self.client.get_entity(target_in)
    target_id = utils.get_peer_id(target)
    target_name = utils.get_display_name(target) or 'unknown'

    msg_cursor = dumper.conn.cursor()
    msg_cursor.execute(
        'SELECT ID, Date, FromID, MediaID FROM Message '
        'WHERE ContextID = ? AND MediaID IS NOT NULL', (target_id,)
    )

    # fetchone() returns None once the rows are exhausted.
    for msg_row in iter(msg_cursor.fetchone, None):
        media_row = dumper.conn.execute(
            'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name '
            'FROM Media WHERE ID = ?', (msg_row[3],)
        ).fetchone()
        if media_row is None:
            # The message references media that was never saved.
            __log__.warning('No media row found for ID %s, skipping',
                            msg_row[3])
            continue

        # Documents have attributes and they're saved under the "document"
        # namespace so we need to split it before actually comparing.
        type_parts = media_row[3].split('.')
        media_type, media_subtype = type_parts[0], type_parts[-1]
        if media_type not in ('photo', 'document'):
            # Only photos or documents are actually downloadable
            continue

        user_row = dumper.conn.execute(
            'SELECT FirstName, LastName FROM User WHERE ID = ?',
            (msg_row[2],)
        ).fetchone()
        if user_row:
            # Build the name from the *user* row (first and last name),
            # not from the message row.
            sender_name = '{} {}'.format(
                user_row[0] or '', user_row[1] or ''
            ).strip()
        else:
            sender_name = ''

        # mimetypes.guess_extension() cannot handle None, so only ask it
        # when a mimetype was actually stored.
        mime_type = media_row[4]
        ext = mimetypes.guess_extension(mime_type) if mime_type else None
        ext = ext or '.bin'
        if ext == '.jpe':
            ext = '.jpg'  # Nobody uses .jpe for photos

        date = datetime.datetime.utcfromtimestamp(msg_row[1])
        formatter = defaultdict(
            str,
            id=msg_row[0],
            context_id=target_id,
            sender_id=msg_row[2] or 0,
            type=media_subtype or 'unknown',
            ext=ext,
            name=target_name,
            sender_name=sender_name or 'unknown'
        )

        # Photos never use the stored name; everything else falls back to
        # a date-based default when no name was stored.
        name = None if media_subtype == 'photo' else media_row[5]
        formatter['filename'] = name or date.strftime(
            '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type'])
        )

        # Date directives are expanded first, then the {placeholders}.
        filename = date.strftime(self.media_fmt).format_map(formatter)
        if not filename.endswith(formatter['ext']):
            if filename.endswith('.'):
                filename = filename[:-1]
            filename += formatter['ext']

        if os.path.isfile(filename):
            __log__.debug('Skipping existing file %s', filename)
            continue

        __log__.info('Downloading to %s', filename)
        # os.makedirs('') raises FileNotFoundError, so only create the
        # parent directory when the formatted path actually has one.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if media_type == 'document':
            # For documents the LocalID/VolumeID/Secret columns are read
            # back as id/version/access_hash respectively.
            location = types.InputDocumentFileLocation(
                id=media_row[0],
                version=media_row[1],
                access_hash=media_row[2]
            )
        else:
            location = types.InputFileLocation(
                local_id=media_row[0],
                volume_id=media_row[1],
                secret=media_row[2]
            )
        self.client.download_file(location, file=filename)

        # Pause between successive downloads (presumably to avoid hitting
        # rate limits -- confirm).
        time.sleep(1)