Пример #1
0
    def __init__(self,
                 client,
                 entity,
                 download_delay=1,
                 download_chunk_size=100):
        """
        Prepares everything needed to back up a single entity.

        :param client:              An initialized TelegramClient, used to download the messages
        :param entity:              The entity (user, chat or channel) that will be backed up
        :param download_delay:      Seconds to wait after a chunk of messages is downloaded
        :param download_chunk_size: How many messages are requested on every download
                                    (Telegram allows at most 100)
        """
        self.client, self.entity = client, entity
        self.download_delay = download_delay
        self.download_chunk_size = download_chunk_size

        # Every entity gets its own directory, named after the entity ID
        backup_dir = path.join(Backuper.backups_dir, str(entity.id))
        self.backup_dir = backup_dir
        self.media_handler = MediaHandler(backup_dir)

        # Opening and closing the database creates the required directories
        TLDatabase(backup_dir).close()

        # Paths to the files that live next to the database
        self.files = {
            key: path.join(backup_dir, filename)
            for key, filename in (('entity', 'entity.tlo'),
                                  ('metadata', 'metadata.json'))
        }
        # TODO Crashes if the other user got us blocked (AttributeError: 'NoneType' object has no attribute 'photo_big')

        # True while a backup (messages being downloaded) is in progress
        self.backup_running = False

        # Optional callable fired whenever the metadata gets saved
        self.on_metadata_change = None

        # Serialize the entity to disk, then load the metadata
        with open(self.files['entity'], 'wb') as stream, BinaryWriter(stream) as writer:
            entity.on_send(writer)
        self.metadata = self.load_metadata()
Пример #2
0
    def __init__(self, client, entity,
                 download_delay=1,
                 download_chunk_size=100):
        """
        Prepares everything needed to back up a single entity.

        :param client:              An initialized TelegramClient, used to download the messages
        :param entity:              The entity (user, chat or channel) that will be backed up
        :param download_delay:      Seconds to wait after a chunk of messages is downloaded
        :param download_chunk_size: How many messages are requested on every download
                                    (Telegram allows at most 100)
        """
        self.client, self.entity = client, entity
        self.download_delay = download_delay
        self.download_chunk_size = download_chunk_size

        # Every entity gets its own directory, named after the entity ID
        backup_dir = path.join(Backuper.backups_dir, str(entity.id))
        self.backup_dir = backup_dir
        self.media_handler = MediaHandler(backup_dir)

        # Opening and closing the database creates the required directories
        TLDatabase(backup_dir).close()

        # Paths to the files that live next to the database
        self.files = {
            key: path.join(backup_dir, filename)
            for key, filename in (('entity', 'entity.tlo'),
                                  ('metadata', 'metadata.json'))
        }
        # TODO Crashes if the other user got us blocked (AttributeError: 'NoneType' object has no attribute 'photo_big')

        # True while a backup (messages being downloaded) is in progress
        self.backup_running = False

        # Optional callable fired whenever the metadata gets saved
        self.on_metadata_change = None

        # Serialize the entity to disk, then load the metadata
        with open(self.files['entity'], 'wb') as stream, BinaryWriter(stream) as writer:
            entity.on_send(writer)
        self.metadata = self.load_metadata()
Пример #3
0
class Backuper:
    # Default output directory for all the made backups
    backups_dir = 'backups'

    #region Initialize

    def __init__(self, client, entity,
                 download_delay=1,
                 download_chunk_size=100):
        """
        Prepares everything needed to back up a single entity.

        :param client:              An initialized TelegramClient, used to download the messages
        :param entity:              The entity (user, chat or channel) that will be backed up
        :param download_delay:      Seconds to wait after a chunk of messages is downloaded
        :param download_chunk_size: How many messages are requested on every download
                                    (Telegram allows at most 100)
        """
        self.client, self.entity = client, entity
        self.download_delay = download_delay
        self.download_chunk_size = download_chunk_size

        # Every entity gets its own directory, named after the entity ID
        backup_dir = path.join(Backuper.backups_dir, str(entity.id))
        self.backup_dir = backup_dir
        self.media_handler = MediaHandler(backup_dir)

        # Opening and closing the database creates the required directories
        TLDatabase(backup_dir).close()

        # Paths to the files that live next to the database
        self.files = {
            key: path.join(backup_dir, filename)
            for key, filename in (('entity', 'entity.tlo'),
                                  ('metadata', 'metadata.json'))
        }
        # TODO Crashes if the other user got us blocked (AttributeError: 'NoneType' object has no attribute 'photo_big')

        # True while a backup (messages being downloaded) is in progress
        self.backup_running = False

        # Optional callable fired whenever the metadata gets saved
        self.on_metadata_change = None

        # Serialize the entity to disk, then load the metadata
        with open(self.files['entity'], 'wb') as stream, BinaryWriter(stream) as writer:
            entity.on_send(writer)
        self.metadata = self.load_metadata()

    #endregion

    #region Metadata handling

    def save_metadata(self):
        """Saves the metadata for the current entity"""
        with open(self.files['metadata'], 'w') as file:
            json.dump(self.metadata, file)

        if self.on_metadata_change:
            self.on_metadata_change()

    def load_metadata(self):
        """Loads the metadata of the current entity"""
        if not path.isfile(self.files['metadata']):
            return {
                'resume_msg_id': 0,
                'saved_msgs': 0,
                'total_msgs': 0,
                'etl': '???',
                'scheme_layer': scheme_layer
            }
        else:
            with open(self.files['metadata'], 'r') as file:
                return json.load(file)

    def update_total_messages(self):
        """Asks Telegram how many messages there are with the current peer,
           then stores that total and a refreshed estimated time left in the
           metadata, which is saved afterwards"""
        # No offset and no limit, only the total message count is of interest
        request = GetHistoryRequest(
            peer=get_input_peer(self.entity),
            offset_id=0,
            limit=0,
            offset_date=None,
            add_offset=0,
            max_id=0,
            min_id=0
        )
        history = self.client.invoke(request)

        meta = self.metadata
        # Results without a 'count' attribute fall back to the returned messages
        meta['total_msgs'] = getattr(history, 'count', len(history.messages))
        meta['etl'] = str(self.calculate_etl(meta['saved_msgs'], meta['total_msgs']))

        self.save_metadata()

    #endregion

    #region Backups listing

    @staticmethod
    def enumerate_backups_entities():
        """Enumerates the entities of all the available backups"""
        if isdir(Backuper.backups_dir):

            # Look for subdirectories
            for directory in listdir(Backuper.backups_dir):
                entity_file = path.join(Backuper.backups_dir, directory, 'entity.tlo')

                # Ensure the entity.pickle file exists
                if isfile(entity_file):

                    # Load and yield it
                    with open(entity_file, 'rb') as file:
                        with BinaryReader(stream=file) as reader:
                            yield reader.tgread_object()

    #endregion

    #region Backup exists and deletion

    @staticmethod
    def exists_backup(entity_id):
        return isdir(path.join(Backuper.backups_dir, str(entity_id)))

    def delete_backup(self):
        """Deletes the backup with the current peer from disk and sets
           everything to None (the backup becomes unusable)"""
        shutil.rmtree(self.backup_dir)

    #endregion

    #region Backups generation

    def start_backup(self):
        """Begins the backup on the given peer"""
        Thread(target=self.backup_messages_thread).start()

    def start_media_backup(self, **kwargs):
        """Begins the media backup on the given peer.
           The valid named arguments are:

           dl_propics: Boolean value determining whether profile pictures should be downloaded
           dl_photos: Boolean value determining whether photos should be downloaded
           dl_docs: Boolean value determining whether documents (and gifs, and stickers) should be downloaded

           docs_max_size: If specified, determines the maximum document size allowed in bytes
           after_date: If specified, only media after this date will be downloaded
           before_date: If specified, only media before this date will be downloaded

           progress_callback: If specified, current download progress will be reported here
                              invoking progress_callback(saved bytes, total bytes, estimated time left)"""
        Thread(target=self.backup_media_thread, kwargs=kwargs).start()

    def stop_backup(self):
        """Stops the backup (either messages or media) on the given peer"""
        self.backup_running = False

    #region Messages backup

    def backup_messages_thread(self):
        """Downloads the message history of the current entity into the backup
           database. Meant to be run in a different thread (see start_backup).

           Messages are requested in chunks of `download_chunk_size` until they
           are all saved or `backup_running` is cleared (see stop_backup). The
           database is committed and the metadata saved after every chunk, so a
           later run resumes from the stored `resume_msg_id`.

           The database connection is always closed and `backup_running` reset
           when this method exits, whether it finished, was stopped or failed"""
        self.backup_running = True

        # Create a connection to the database
        db = TLDatabase(self.backup_dir)

        try:
            # Determine whether we started making the backup from the very first message or not.
            # If this is the case:
            #   We won't need to come back to the first message again after we've finished downloading
            #   them all, since that first message will already be in backup.
            #
            # Otherwise, if we did not start from the first message:
            #   More messages were in the backup already, and after we backup those "left" ones,
            #   we must return to the first message and backup until where we started.
            started_at_0 = self.metadata['resume_msg_id'] == 0

            # Keep an internal downloaded count for it to be faster
            # (instead of querying the database all the time)
            self.metadata['saved_msgs'] = db.count('messages')

            # We also need to keep track of how many messages we've downloaded now
            # in order to calculate the estimated time left properly
            saved_msgs_now = 0

            # We need this to invoke GetHistoryRequest
            input_peer = get_input_peer(self.entity)

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            # Enter the download-messages main loop
            while self.backup_running:
                # Invoke the GetHistoryRequest to get the next messages after those we have
                result = self.client.invoke(GetHistoryRequest(
                    peer=input_peer,
                    offset_id=self.metadata['resume_msg_id'],
                    limit=self.download_chunk_size,
                    offset_date=None,
                    add_offset=0,
                    max_id=0,
                    min_id=0
                ))
                self.metadata['total_msgs'] = getattr(result, 'count', len(result.messages))

                # First add users and chats, replacing any previous value
                for user in result.users:
                    db.add_object(user, replace=True)
                for chat in result.chats:
                    db.add_object(chat, replace=True)

                # Then add the messages to the backup
                for msg in result.messages:
                    if db.in_table(msg.id, 'messages'):
                        # If the message we retrieved was already saved, this means that we're
                        # done because we have the rest of the messages.
                        # Clear the list so we enter the next if, and break to early terminate
                        self.metadata['resume_msg_id'] = result.messages[-1].id
                        del result.messages[:]
                        break
                    else:
                        db.add_object(msg)
                        saved_msgs_now += 1
                        self.metadata['saved_msgs'] += 1
                        self.metadata['resume_msg_id'] = msg.id

                self.metadata['etl'] = str(self.calculate_etl(
                    saved_msgs_now, self.metadata['total_msgs'],
                    start=start))

                # Always commit at the end to save changes
                db.commit()
                self.save_metadata()

                # The list can be empty because we've either used a too big offset
                # (in which case we have all the previous messages), or we've reached
                # a point where we have the upcoming messages (so there's no need to
                # download them again and we stopped)
                if not result.messages:
                    # We've downloaded all the messages since the last backup
                    if started_at_0:
                        # And since we started from the very first message, we have them all
                        print('Downloaded all {}'.format(self.metadata['total_msgs']))
                        break
                    else:
                        # We need to start from the first message (latest sent message)
                        # and backup again until we have them all
                        self.metadata['resume_msg_id'] = 0
                        started_at_0 = True

                # Always sleep a bit, or Telegram will get angry and tell us to chill
                sleep(self.download_delay)

        except KeyboardInterrupt:
            print('Operation cancelled, not downloading more messages!')
            # Also commit here, we don't want to lose any information!
            db.commit()
            self.save_metadata()

        finally:
            # Release the connection no matter how we got here, and make sure
            # the running flag is cleared even if closing fails
            try:
                db.close()
            finally:
                self.backup_running = False

    #endregion

    #region Media backups

    def backup_propic(self):
        """Backups the profile picture for the given
           entity as the current peer profile picture, returning its path"""

        # Allow multiple versions of the profile picture
        # TODO Maybe this should be another method, because when downloading media... We also have multiple versions
        filename = self.media_handler.get_propic_path(self.entity, allow_multiple=True)
        generic_filename = self.media_handler.get_propic_path(self.entity)
        if filename:  # User may not have a profile picture
            if not isfile(filename):
                # Only download the file if it doesn't exist yet
                self.client.download_profile_photo(self.entity.photo,
                                                   file_path=filename,
                                                   add_extension=False)
                # If we downloaded a new version, copy it to the "default" generic file
                if isfile(generic_filename):
                    remove(generic_filename)
                shutil.copy(filename, generic_filename)

            # The user may not have a profile picture
            return generic_filename

    def calculate_download_size(self, dl_propics, dl_photos, dl_docs,
                                docs_max_size=None, before_date=None, after_date=None):
        """Estimates how many bytes a media backup with the given options would
           download, based on what is stored in the backup database"""
        estimate = 0
        with TLDatabase(self.backup_dir) as db:
            # TODO How does Telegram Desktop find out the profile photo size?
            if dl_propics:
                # An average size is assumed for every user with a photo
                estimate += db.count('users where photo not null') * AVERAGE_PROPIC_SIZE

            if dl_photos:
                photo_query = self.get_query(MessageMediaPhoto, before_date, after_date)
                # The last entry of `sizes` is the one that gets counted
                estimate += sum(msg.media.photo.sizes[-1].size
                                for msg in db.query_messages(photo_query))

            if dl_docs:
                doc_query = self.get_query(MessageMediaDocument, before_date, after_date)
                for msg in db.query_messages(doc_query):
                    size = msg.media.document.size
                    # Documents above the maximum size (if any was given) don't count
                    if docs_max_size and size > docs_max_size:
                        continue
                    estimate += size

        return estimate

    def backup_media_thread(self, dl_propics, dl_photos, dl_docs,
                            docs_max_size=None, before_date=None, after_date=None,
                            progress_callback=None):
        """Backups the specified media referenced by the backup database.
           Meant to be run in a different thread (see start_media_backup).

           dl_propics: Whether profile pictures should be downloaded
           dl_photos: Whether photos should be downloaded
           dl_docs: Whether documents should be downloaded

           docs_max_size: If specified, documents bigger than this (in bytes) are skipped
           before_date: If specified, only media before this date will be downloaded
           after_date: If specified, only media after this date will be downloaded

           progress_callback: If specified, invoked after every processed item as
                              progress_callback(saved bytes, total bytes, estimated time left)

           Files that already exist with a non-zero size are not downloaded again.
           The download stops as soon as `backup_running` is cleared (see stop_backup).
           The database connection is always closed and `backup_running` reset on exit"""
        self.backup_running = True

        # Create a connection to the database
        db = TLDatabase(self.backup_dir)

        try:
            # Store how many bytes we have/how many bytes there are in total.
            # The dates are passed by keyword so they can't end up swapped
            current = 0
            total = self.calculate_download_size(
                dl_propics, dl_photos, dl_docs,
                docs_max_size=docs_max_size,
                before_date=before_date,
                after_date=after_date)

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            if dl_propics:
                # TODO Also query chats and channels
                for user in db.query_users('where photo not null'):
                    if not self.backup_running:
                        return
                    # Try downloading the photo
                    output = self.media_handler.get_propic_path(user)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_profile_photo(
                                user.photo, add_extension=False, file_path=output)
                            sleep(self.download_delay)

                    except RPCError as e:
                        print('Error downloading profile photo:', e)
                    finally:
                        # The item counts as processed even if its download failed
                        current += AVERAGE_PROPIC_SIZE
                        if progress_callback:
                            progress_callback(current, total, self.calculate_etl(current, total, start))

            if dl_photos:
                for msg in db.query_messages(self.get_query(MessageMediaPhoto, before_date, after_date)):
                    if not self.backup_running:
                        return
                    # Try downloading the photo
                    output = self.media_handler.get_msg_media_path(msg)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_msg_media(
                                msg.media, add_extension=False, file_path=output)
                            sleep(self.download_delay)

                    except RPCError as e:
                        print('Error downloading photo:', e)
                    finally:
                        current += msg.media.photo.sizes[-1].size
                        if progress_callback:
                            progress_callback(current, total, self.calculate_etl(current, total, start))

            # TODO Add an internal callback to determine how the current document download is going,
            # and update our currently saved bytes count based on that
            if dl_docs:
                for msg in db.query_messages(self.get_query(MessageMediaDocument, before_date, after_date)):
                    if not self.backup_running:
                        return

                    if not docs_max_size or msg.media.document.size <= docs_max_size:
                        # Try downloading the document
                        output = self.media_handler.get_msg_media_path(msg)
                        try:
                            if not self.valid_file_exists(output):
                                self.client.download_msg_media(
                                    msg.media, add_extension=False, file_path=output)
                                # Only wait after an actual download, like the loops above
                                sleep(self.download_delay)

                        except RPCError as e:
                            print('Error downloading document:', e)
                        finally:
                            current += msg.media.document.size
                            if progress_callback:
                                progress_callback(current, total, self.calculate_etl(current, total, start))
        finally:
            # Release the connection on every exit path (finished, stopped or failed)
            # and make sure the running flag is cleared even if closing fails
            try:
                db.close()
            finally:
                self.backup_running = False

    #endregion

    #endregion

    #region Utilities

    def calculate_etl(self, downloaded, total, start=None):
        """Calculates the estimated time left, based on how long it took us
           to reach "downloaded" and how many messages we have left.

           If no start time is given, the time will simply by estimated by how
           many chunks are left, which will NOT work if what is being downloaded is media"""
        left = total - downloaded
        if not start:
            # We add chunk size - 1 because division will truncate the decimal places,
            # so for example, if we had a chunk size of 8:
            #   7 messages + 7 = 14 -> 14 // 8 = 1 chunk download required
            #   8 messages + 7 = 15 -> 15 // 8 = 1 chunk download required
            #   9 messages + 7 = 16 -> 16 // 8 = 2 chunks download required
            #
            # Clearly, both 7 and 8 fit in one chunk, but 9 doesn't.
            chunks_left = (left + self.download_chunk_size - 1) // self.download_chunk_size
            etl = chunks_left * self.download_delay
        else:
            if downloaded:
                delta_time = (datetime.now() - start).total_seconds() / downloaded
                etl = left * delta_time
            else:
                etl = 0

        return timedelta(seconds=round(etl, 1))

    @staticmethod
    def get_query(clazz, before_date=None, after_date=None):
        """Returns a database query filtering by media_id (its class),
           and optionally range dates"""
        filters = 'where media_id = {}'.format(clazz.constructor_id)
        if before_date:
            filters += " and date <= '{}'".format(before_date)
        if after_date:
            filters += " and date >= '{}'".format(after_date)
        return filters

    @staticmethod
    def valid_file_exists(file):
        """Determines whether a file exists and its "valid"
           (i.e., the file size is greater than 0; if it's 0, it probably faild dueto an RPC error)"""
        return path.isfile(file) and path.getsize(file) > 0
Пример #4
0
    def export_thread(self, callback):
        """Exports a conversation to HTML, one output per day of messages
           (should be ran in a different thread).

           :param callback: Optional callable which receives the progress dictionary
                            (keys 'exported', 'total' and 'etl') every time a new day
                            starts and once more when the export has finished.
                            If None, the progress is printed instead.

           When the database holds no messages nothing is exported, and the
           callback is only notified that the export has finished"""

        with TLDatabase(self.backups_dir) as db:
            db_media_handler = MediaHandler(self.backups_dir)

            # First copy the default media files
            self.copy_default_media()

            progress = {
                'exported': 0,
                'total': db.count('messages'),
                'etl': 'Unknown'
            }

            # Without messages there is no first date to start from, so there
            # is nothing to export
            if not progress['total']:
                if callback:
                    progress['etl'] = timedelta(seconds=0)
                    callback(progress)
                return

            # The first date will obviously be the first day
            previous_date = self.get_message_date(db.query_message('order by id asc'))

            # Also find the next day
            following_date = self.get_previous_and_next_day(db, previous_date)[1]

            # Set the first writer (which will have the "previous" date, the first one)
            writer = HTMLTLWriter(previous_date, self.media_handler,
                                  following_date=following_date)
            try:
                # Keep track from when we started to determine the estimated time left
                start = datetime.now()

                # Export the profile photos, from users chats and channels
                # TODO This should also have a progress if we have a backup of thousands of files!
                for user in db.query_users():
                    if user.photo:
                        source = db_media_handler.get_propic_path(user)
                        output = self.media_handler.get_propic_path(user)
                        if isfile(source):
                            copyfile(source, output)

                # Iterate over all the messages to export them in their respective days
                for msg in db.query_messages('order by id asc'):
                    msg_date = self.get_message_date(msg)
                    progress['exported'] += 1

                    # As soon as we're in the next day, update the output the writer
                    if msg_date != previous_date:
                        # Exit the previous writer to end the header. It is forgotten
                        # right away so it can't be exited twice if what follows fails
                        writer.__exit__(None, None, None)
                        writer = None

                        # Update date values and create a new instance
                        previous_date, following_date =\
                            self.get_previous_and_next_day(db, msg_date)

                        writer = HTMLTLWriter(msg_date, self.media_handler,
                                              previous_date=previous_date,
                                              following_date=following_date)
                        # Call the callback
                        if callback:
                            progress['etl'] = self.calculate_etl(start, progress['exported'], progress['total'])
                            callback(progress)
                        else:
                            print(progress)

                    writer.write_message(msg, db)
                    # If the message has media, we need to copy it so it's accessible by the exported HTML
                    if not isinstance(msg, MessageService) and msg.media:
                        source = db_media_handler.get_msg_media_path(msg)
                        output = self.media_handler.get_msg_media_path(msg)
                        # Source may be None if the media is unsupported (i.e. a webpage)
                        if source and isfile(source):
                            copyfile(source, output)

                    previous_date = msg_date
            finally:
                # Always exit the current writer at the end, also if the export failed
                if writer is not None:
                    writer.__exit__(None, None, None)

            # Call the callback to notify we've finished
            if callback:
                progress['etl'] = timedelta(seconds=0)
                callback(progress)
Пример #5
0
class Backuper:
    # Default output directory for all the made backups
    backups_dir = 'backups'

    #region Initialize

    def __init__(self,
                 client,
                 entity,
                 download_delay=1,
                 download_chunk_size=100):
        """
        Prepares everything needed to back up a single entity.

        :param client:              An initialized TelegramClient, used to download the messages
        :param entity:              The entity (user, chat or channel) that will be backed up
        :param download_delay:      Seconds to wait after a chunk of messages is downloaded
        :param download_chunk_size: How many messages are requested on every download
                                    (Telegram allows at most 100)
        """
        self.client, self.entity = client, entity
        self.download_delay = download_delay
        self.download_chunk_size = download_chunk_size

        # Every entity gets its own directory, named after the entity ID
        backup_dir = path.join(Backuper.backups_dir, str(entity.id))
        self.backup_dir = backup_dir
        self.media_handler = MediaHandler(backup_dir)

        # Opening and closing the database creates the required directories
        TLDatabase(backup_dir).close()

        # Paths to the files that live next to the database
        self.files = {
            key: path.join(backup_dir, filename)
            for key, filename in (('entity', 'entity.tlo'),
                                  ('metadata', 'metadata.json'))
        }
        # TODO Crashes if the other user got us blocked (AttributeError: 'NoneType' object has no attribute 'photo_big')

        # True while a backup (messages being downloaded) is in progress
        self.backup_running = False

        # Optional callable fired whenever the metadata gets saved
        self.on_metadata_change = None

        # Serialize the entity to disk, then load the metadata
        with open(self.files['entity'], 'wb') as stream, BinaryWriter(stream) as writer:
            entity.on_send(writer)
        self.metadata = self.load_metadata()

    #endregion

    #region Metadata handling

    def save_metadata(self):
        """Saves the metadata for the current entity"""
        with open(self.files['metadata'], 'w', encoding='utf-8') as file:
            json.dump(self.metadata, file)

        if self.on_metadata_change:
            self.on_metadata_change()

    def load_metadata(self):
        """Loads the metadata of the current entity"""
        if not path.isfile(self.files['metadata']):
            return {
                'resume_msg_id': 0,
                'saved_msgs': 0,
                'total_msgs': 0,
                'etl': '???',
                'scheme_layer': scheme_layer
            }
        else:
            with open(self.files['metadata'], 'r', encoding='utf-8') as file:
                return json.load(file)

    def update_total_messages(self):
        """Asks Telegram how many messages there are with the current peer,
           then stores that total and a refreshed estimated time left in the
           metadata, which is saved afterwards"""
        # No offset and no limit, only the total message count is of interest
        request = GetHistoryRequest(
            peer=self.entity,
            offset_id=0,
            limit=0,
            offset_date=None,
            add_offset=0,
            max_id=0,
            min_id=0
        )
        history = self.client.invoke(request)

        meta = self.metadata
        # Results without a 'count' attribute fall back to the returned messages
        meta['total_msgs'] = getattr(history, 'count', len(history.messages))
        meta['etl'] = str(self.calculate_etl(meta['saved_msgs'], meta['total_msgs']))

        self.save_metadata()

    #endregion

    #region Backups listing

    @staticmethod
    def enumerate_backups_entities():
        """Enumerates the entities of all the available backups"""
        if isdir(Backuper.backups_dir):

            # Look for subdirectories
            for directory in listdir(Backuper.backups_dir):
                entity_file = path.join(Backuper.backups_dir, directory,
                                        'entity.tlo')

                # Ensure the entity.pickle file exists
                if isfile(entity_file):

                    # Load and yield it
                    with open(entity_file, 'rb') as file:
                        with BinaryReader(stream=file) as reader:
                            try:
                                yield reader.tgread_object()
                            except TypeNotFoundError:
                                # Old user, scheme got updated, don't care.
                                pass

    #endregion

    #region Backup exists and deletion

    @staticmethod
    def exists_backup(entity_id):
        return isdir(path.join(Backuper.backups_dir, str(entity_id)))

    def delete_backup(self):
        """Deletes the backup with the current peer from disk and sets
           everything to None (the backup becomes unusable)"""
        shutil.rmtree(self.backup_dir)

    #endregion

    #region Backups generation

    def start_backup(self):
        """Begins the backup on the given peer"""
        Thread(target=self.backup_messages_thread).start()

    def start_media_backup(self, **kwargs):
        """Launches the media backup for the current peer in a new thread.
           Every named argument is forwarded to backup_media_thread:

           dl_propics: Whether profile pictures should be downloaded
           dl_photos: Whether photos should be downloaded
           dl_docs: Whether documents (and gifs, and stickers) should be downloaded

           docs_max_size: Optional maximum document size allowed, in bytes
           after_date: Optional, only media after this date will be downloaded
           before_date: Optional, only media before this date will be downloaded

           progress_callback: Optional, invoked to report the download progress as
                              progress_callback(saved bytes, total bytes, estimated time left)"""
        media_worker = Thread(target=self.backup_media_thread, kwargs=kwargs)
        media_worker.start()

    def stop_backup(self):
        """Requests the running backup (either messages or media) to stop.

        Only the ``backup_running`` flag is cleared here; the backup
        threads check it between downloads and exit on their own."""
        self.backup_running = False

    #region Messages backup

    def backup_messages_thread(self):
        """Backs up the messages of the current entity; should be ran in a different thread.

        Message chunks are requested with GetHistoryRequest until either
        ``backup_running`` is cleared or no unsaved messages are left.
        After every chunk the database is committed and the metadata
        (``saved_msgs``, ``total_msgs``, ``resume_msg_id`` and ``etl``) is saved.

        The database connection is always closed and ``backup_running``
        is always reset when this method exits, even on errors.
        """
        self.backup_running = True

        # Create a connection to the database
        db = TLDatabase(self.backup_dir)

        # Make the backup
        try:
            # Determine whether we started making the backup from the very first message or not.
            # If this is the case:
            #   We won't need to come back to the first message again after we've finished downloading
            #   them all, since that first message will already be in backup.
            #
            # Otherwise, if we did not start from the first message:
            #   More messages were in the backup already, and after we backup those "left" ones,
            #   we must return to the first message and backup until where we started.
            started_at_0 = self.metadata['resume_msg_id'] == 0

            # Keep an internal downloaded count for it to be faster
            # (instead of querying the database all the time)
            self.metadata['saved_msgs'] = db.count('messages')

            # We also need to keep track of how many messages we've downloaded now
            # in order to calculate the estimated time left properly
            saved_msgs_now = 0

            # We need this to invoke GetHistoryRequest
            input_peer = self.entity

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            # Enter the download-messages main loop
            self.client.connect()
            while self.backup_running:
                # Invoke the GetHistoryRequest to get the next messages after those we have
                result = self.client.invoke(
                    GetHistoryRequest(peer=input_peer,
                                      offset_id=self.metadata['resume_msg_id'],
                                      limit=self.download_chunk_size,
                                      offset_date=None,
                                      add_offset=0,
                                      max_id=0,
                                      min_id=0))
                # For some strange reason, GetHistoryRequest might return upload.file.File
                # Ensure we retrieved Messages, MessagesSlice or ChannelMessages
                if not isinstance(result, (Messages, MessagesSlice,
                                           ChannelMessages)):
                    print('Invalid result type when downloading messages:',
                          type(result))
                    sleep(self.download_delay)
                    continue

                # Not every result type has a 'count', fall back to the chunk length
                self.metadata['total_msgs'] = getattr(result, 'count',
                                                      len(result.messages))

                # First add users and chats, replacing any previous value
                for user in result.users:
                    db.add_object(user, replace=True)
                for chat in result.chats:
                    db.add_object(chat, replace=True)

                # Then add the messages to the backup
                for msg in result.messages:
                    if db.in_table(msg.id, 'messages'):
                        # If the message we retrieved was already saved, this means that we're
                        # done because we have the rest of the messages.
                        # Clear the list so we enter the next if, and break to early terminate
                        self.metadata['resume_msg_id'] = result.messages[-1].id
                        del result.messages[:]
                        break
                    else:
                        db.add_object(msg)
                        saved_msgs_now += 1
                        self.metadata['saved_msgs'] += 1
                        self.metadata['resume_msg_id'] = msg.id

                self.metadata['etl'] = str(
                    self.calculate_etl(saved_msgs_now,
                                       self.metadata['total_msgs'],
                                       start=start))

                # Always commit at the end to save changes
                db.commit()
                self.save_metadata()

                # The list can be empty because we've either used a too big offset
                # (in which case we have all the previous messages), or we've reached
                # a point where we have the upcoming messages (so there's no need to
                # download them again and we stopped)
                if not result.messages:
                    # We've downloaded all the messages since the last backup
                    if started_at_0:
                        # And since we started from the very first message, we have them all
                        print('Downloaded all {}'.format(
                            self.metadata['total_msgs']))
                        break
                    else:
                        # We need to start from the first message (latest sent message)
                        # and backup again until we have them all
                        self.metadata['resume_msg_id'] = 0
                        started_at_0 = True

                # Always sleep a bit, or Telegram will get angry and tell us to chill
                sleep(self.download_delay)

        except KeyboardInterrupt:
            print('Operation cancelled, not downloading more messages!')
            # Also commit here, we don't want to lose any information!
            db.commit()
            self.save_metadata()

        finally:
            # Reset the flag first so it's cleared even if closing fails
            self.backup_running = False
            # Release the database connection opened by this thread
            db.close()

    #endregion

    #region Media backups

    def backup_propic(self):
        """Backs up the current profile picture of the entity.

        A versioned file is downloaded if it's not on disk yet, and then
        copied over the generic (non-versioned) file.

        :return: The path of the generic profile picture file, or None if
                 the media handler gave no path for a versioned picture
        """
        # Allow multiple versions of the profile picture
        # TODO Maybe this should be another method, because when downloading media... We also have multiple versions
        versioned_path = self.media_handler.get_propic_path(
            self.entity, allow_multiple=True)
        generic_path = self.media_handler.get_propic_path(self.entity)

        # User may not have a profile picture
        if not versioned_path:
            return None

        # This version is on disk already, there's nothing to download
        if isfile(versioned_path):
            return generic_path

        self.client.download_profile_photo(self.entity.photo,
                                           file_path=versioned_path,
                                           add_extension=False)

        # We downloaded a new version, so make it the "default" generic file
        if isfile(generic_path):
            remove(generic_path)
        shutil.copy(versioned_path, generic_path)
        return generic_path

    def calculate_download_size(self,
                                dl_propics,
                                dl_photos,
                                dl_docs,
                                docs_max_size=None,
                                before_date=None,
                                after_date=None):
        """Estimates how many bytes the media backup would download.

        :param dl_propics:    Whether profile pictures are counted (AVERAGE_PROPIC_SIZE per user with photo)
        :param dl_photos:     Whether photos are counted (size of the last entry in their sizes list)
        :param dl_docs:       Whether documents are counted
        :param docs_max_size: If specified, documents bigger than this are not counted
        :param before_date:   If specified, passed to get_query as the upper date limit
        :param after_date:    If specified, passed to get_query as the lower date limit
        :return:              The estimated size, in bytes
        """
        with TLDatabase(self.backup_dir) as db:
            estimate = 0

            # TODO How does Telegram Desktop find out the profile photo size?
            if dl_propics:
                users_with_photo = db.count('users where photo not null')
                estimate += users_with_photo * AVERAGE_PROPIC_SIZE

            if dl_photos:
                photo_query = self.get_query(MessageMediaPhoto, before_date,
                                             after_date)
                estimate += sum(msg.media.photo.sizes[-1].size
                                for msg in db.query_messages(photo_query))

            if dl_docs:
                doc_query = self.get_query(MessageMediaDocument, before_date,
                                           after_date)
                for msg in db.query_messages(doc_query):
                    doc_size = msg.media.document.size
                    # Skip the documents which exceed the limit, if any
                    if docs_max_size and doc_size > docs_max_size:
                        continue
                    estimate += doc_size

            return estimate

    def backup_media_thread(self,
                            dl_propics,
                            dl_photos,
                            dl_docs,
                            docs_max_size=None,
                            before_date=None,
                            after_date=None,
                            progress_callback=None):
        """Backs up the specified media found in the backup database; should be ran in a different thread.

        :param dl_propics:        Whether the profile pictures of the saved users should be downloaded
        :param dl_photos:         Whether photos should be downloaded
        :param dl_docs:           Whether documents should be downloaded
        :param docs_max_size:     If specified, documents bigger than this (in bytes) are skipped
        :param before_date:       If specified, only media before this date will be downloaded
        :param after_date:        If specified, only media after this date will be downloaded
        :param progress_callback: If specified, invoked after every processed file as
                                  progress_callback(saved bytes, total bytes, estimated time left)

        The download stops early as soon as ``backup_running`` is cleared.
        The database connection is always closed and ``backup_running`` is
        always reset when this method exits.
        """
        self.backup_running = True

        # Create a connection to the database
        db = TLDatabase(self.backup_dir)

        try:
            # Store how many bytes we have/how many bytes there are in total.
            # The dates are passed by keyword so they can't be swapped by accident
            current = 0
            total = self.calculate_download_size(dl_propics, dl_photos, dl_docs,
                                                 docs_max_size=docs_max_size,
                                                 before_date=before_date,
                                                 after_date=after_date)

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            def report_progress(size):
                """Adds size to the saved bytes count and notifies the callback, if any"""
                nonlocal current
                current += size
                if progress_callback:
                    progress_callback(
                        current, total,
                        self.calculate_etl(current, total, start))

            if dl_propics:
                # TODO Also query chats and channels
                for user in db.query_users('where photo not null'):
                    if not self.backup_running:
                        return
                    # Try downloading the photo
                    output = self.media_handler.get_propic_path(user)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_profile_photo(user.photo,
                                                               add_extension=False,
                                                               file_path=output)
                            sleep(self.download_delay)

                    except RPCError as e:
                        print('Error downloading profile photo:', e)
                    finally:
                        report_progress(AVERAGE_PROPIC_SIZE)

            if dl_photos:
                for msg in db.query_messages(
                        self.get_query(MessageMediaPhoto, before_date,
                                       after_date)):
                    if not self.backup_running:
                        return
                    # Try downloading the photo
                    output = self.media_handler.get_msg_media_path(msg)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_msg_media(msg.media,
                                                           add_extension=False,
                                                           file_path=output)
                            sleep(self.download_delay)

                    except RPCError as e:
                        print('Error downloading photo:', e)
                    finally:
                        report_progress(msg.media.photo.sizes[-1].size)

            # TODO Add an internal callback to determine how the current document download is going,
            # and update our currently saved bytes count based on that
            if dl_docs:
                for msg in db.query_messages(
                        self.get_query(MessageMediaDocument, before_date,
                                       after_date)):
                    if not self.backup_running:
                        return

                    if not docs_max_size or msg.media.document.size <= docs_max_size:
                        # Try downloading the document
                        output = self.media_handler.get_msg_media_path(msg)
                        try:
                            if not self.valid_file_exists(output):
                                self.client.download_msg_media(msg.media,
                                                               add_extension=False,
                                                               file_path=output)
                                # Only wait after an actual download, like for photos
                                sleep(self.download_delay)

                        except RPCError as e:
                            print('Error downloading document:', e)
                        finally:
                            report_progress(msg.media.document.size)
        finally:
            # Reset the flag first so it's cleared even if closing fails
            self.backup_running = False
            # Also reached on the early returns above, so the connection never leaks
            db.close()

    #endregion

    #endregion

    #region Utilities

    def calculate_etl(self, downloaded, total, start=None):
        """Calculates the estimated time left to reach "total" from "downloaded".

           With a start time, the average time taken per downloaded unit so far
           is applied to what is left (0 if nothing was downloaded yet).

           Without a start time, the estimation is the download delay times the
           chunks left, which will NOT work if what is being downloaded is media

           :return: A timedelta, with its seconds rounded to one decimal place"""
        remaining = total - downloaded

        if start:
            if downloaded:
                elapsed = (datetime.now() - start).total_seconds()
                seconds = remaining * (elapsed / downloaded)
            else:
                seconds = 0
        else:
            # Round the division up by adding chunk size - 1 before truncating,
            # so for example, with a chunk size of 8:
            #   7 messages + 7 = 14 -> 14 // 8 = 1 chunk download required
            #   8 messages + 7 = 15 -> 15 // 8 = 1 chunk download required
            #   9 messages + 7 = 16 -> 16 // 8 = 2 chunks download required
            chunk = self.download_chunk_size
            chunks_left = (remaining + chunk - 1) // chunk
            seconds = chunks_left * self.download_delay

        return timedelta(seconds=round(seconds, 1))

    @staticmethod
    def get_query(clazz, before_date=None, after_date=None):
        """Returns a database query filtering by media_id (its class),
           and optionally range dates"""
        filters = 'where media_id = {}'.format(clazz.constructor_id)
        if before_date:
            filters += " and date <= '{}'".format(before_date)
        if after_date:
            filters += " and date >= '{}'".format(after_date)
        return filters

    @staticmethod
    def valid_file_exists(file):
        """Determines whether a file exists and its "valid"
           (i.e., the file size is greater than 0; if it's 0, it probably faild dueto an RPC error)"""
        return path.isfile(file) and path.getsize(file) > 0
Пример #6
0
 def __init__(self, backups_dir, name):
     """
     :param backups_dir: The backup directory, stored as given
     :param name:        The export name, used as the subdirectory of Exporter.export_dir
     """
     export_path = path.join(Exporter.export_dir, name)

     self.name = name
     self.backups_dir = backups_dir
     self.output_dir = export_path
     self.media_handler = MediaHandler(export_path)
Пример #7
0
class Exporter:
    """Class used to export database files"""

    # Default output directory for all the exported backups
    export_dir = 'backups/exported'

    def __init__(self, backups_dir, name):
        """
        :param backups_dir: The backup directory holding the database and media to export
        :param name:        The export name, used as the subdirectory of ``export_dir``
        """
        self.backups_dir = backups_dir
        self.name = name
        self.output_dir = path.join(Exporter.export_dir, name)
        self.media_handler = MediaHandler(self.output_dir)

    #region Exporting databases

    def export(self, callback=None):
        """Exports the database in a new thread.
           An optional callback function can be given with one
           dictionary parameter containing progress information
           (exported, total, etl)"""

        Thread(target=self.export_thread, kwargs={'callback': callback}).start()

    def copy_default_media(self):
        """Copies the default media and style sheets to the output directory"""

        makedirs(self.output_dir, exist_ok=True)
        copyfile('exporter/resources/style.css', path.join(self.output_dir, 'style.css'))

        self.media_handler.make_tree()
        copyfile('exporter/resources/default_propic.png',
                 self.media_handler.get_default_file('propics'))

        copyfile('exporter/resources/default_photo.png',
                 self.media_handler.get_default_file('photos'))

    def export_thread(self, callback):
        """The exporting a conversation method (should be ran in a different thread).

           :param callback: If not None, invoked with the progress dictionary every time
                            a new day starts and once more when the export finishes;
                            otherwise the per-day progress is printed instead"""

        with TLDatabase(self.backups_dir) as db:
            db_media_handler = MediaHandler(self.backups_dir)

            # First copy the default media files
            self.copy_default_media()

            progress = {
                'exported': 0,
                'total': db.count('messages'),
                'etl': 'Unknown'
            }

            # The first date will obviously be the first day
            previous_date = self.get_message_date(db.query_message('order by id asc'))
            if previous_date is None:
                # There are no messages at all, so there is no day to write.
                # Notify we've finished instead of failing on the date arithmetic
                if callback:
                    progress['etl'] = timedelta(seconds=0)
                    callback(progress)
                return

            # Also find the next day
            following_date = self.get_previous_and_next_day(db, previous_date)[1]

            # Set the first writer (which will have the "previous" date, the first one)
            writer = HTMLTLWriter(previous_date, self.media_handler,
                                  following_date=following_date)

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            # Export the profile photos, from users chats and channels
            # TODO This should also have a progress if we have a backup of thousands of files!
            for user in db.query_users():
                if user.photo:
                    source = db_media_handler.get_propic_path(user)
                    output = self.media_handler.get_propic_path(user)
                    if isfile(source):
                        copyfile(source, output)

            # Iterate over all the messages to export them in their respective days
            for msg in db.query_messages('order by id asc'):
                msg_date = self.get_message_date(msg)
                progress['exported'] += 1

                # As soon as we're in the next day, update the output the writer
                if msg_date != previous_date:
                    # Exit the previous writer to end the header
                    writer.__exit__(None, None, None)

                    # Update date values and create a new instance
                    previous_date, following_date =\
                        self.get_previous_and_next_day(db, msg_date)

                    writer = HTMLTLWriter(msg_date, self.media_handler,
                                          previous_date=previous_date,
                                          following_date=following_date)
                    # Call the callback
                    if callback:
                        progress['etl'] = self.calculate_etl(start, progress['exported'], progress['total'])
                        callback(progress)
                    else:
                        print(progress)

                writer.write_message(msg, db)
                # If the message has media, we need to copy it so it's accessible by the exported HTML
                if not isinstance(msg, MessageService) and msg.media:
                    source = db_media_handler.get_msg_media_path(msg)
                    output = self.media_handler.get_msg_media_path(msg)
                    # Source may be None if the media is unsupported (i.e. a webpage)
                    if source and isfile(source):
                        copyfile(source, output)

                previous_date = msg_date

            # Always exit at the end
            writer.__exit__(None, None, None)
            # Call the callback to notify we've finished
            if callback:
                progress['etl'] = timedelta(seconds=0)
                callback(progress)

    #endregion

    #region Utilities

    @staticmethod
    def get_previous_and_next_day(db, message_date):
        """Gets the previous and following saved days given the day in between in the database.

           :return: A (previous, following) tuple of dates, where either
                    is None if the database has no such message"""
        previous = db.query_message("where date < '{}' order by id desc"
                                    .format(message_date))
        following = db.query_message("where date >= '{}' order by id asc"
                                     .format(message_date + timedelta(days=1)))

        return Exporter.get_message_date(previous), Exporter.get_message_date(following)

    @staticmethod
    def calculate_etl(start, saved, total):
        """Calculates the estimated time left, based on how long it took us
           to reach "saved" and how many messages we have left.

           If nothing was saved yet there is no average to work with,
           so a zero timedelta is returned instead of dividing by zero"""
        if not saved:
            return timedelta(seconds=0)

        delta_time = (datetime.now() - start).total_seconds() / saved
        left = total - saved
        return timedelta(seconds=round(left * delta_time, 1))

    @staticmethod
    def get_message_date(message):
        """Retrieves the given message DATE, ignoring the time (hour, minutes, seconds, etc.).
           Returns None if there is no message"""
        if not message:
            return None

        return date(year=message.date.year, month=message.date.month, day=message.date.day)
Пример #8
0
    def export_thread(self, callback):
        """The exporting a conversation method (should be ran in a different thread).

           :param callback: If not None, invoked with the progress dictionary every time
                            a new day starts and once more when the export finishes;
                            otherwise the per-day progress is printed instead"""

        with TLDatabase(self.backups_dir) as db:
            db_media_handler = MediaHandler(self.backups_dir)

            # First copy the default media files
            self.copy_default_media()

            progress = {
                'exported': 0,
                'total': db.count('messages'),
                'etl': 'Unknown'
            }

            # The first date will obviously be the first day
            previous_date = self.get_message_date(
                db.query_message('order by id asc'))
            if previous_date is None:
                # There are no messages at all, so there is no day to write.
                # Notify we've finished instead of failing on the date arithmetic
                if callback:
                    progress['etl'] = timedelta(seconds=0)
                    callback(progress)
                return

            # Also find the next day
            following_date = self.get_previous_and_next_day(db,
                                                            previous_date)[1]

            # Set the first writer (which will have the "previous" date, the first one)
            writer = HTMLTLWriter(previous_date,
                                  self.media_handler,
                                  following_date=following_date)

            # Keep track from when we started to determine the estimated time left
            start = datetime.now()

            # Export the profile photos, from users chats and channels
            # TODO This should also have a progress if we have a backup of thousands of files!
            for user in db.query_users():
                if user.photo:
                    source = db_media_handler.get_propic_path(user)
                    output = self.media_handler.get_propic_path(user)
                    if isfile(source):
                        copyfile(source, output)

            # Iterate over all the messages to export them in their respective days
            for msg in db.query_messages('order by id asc'):
                msg_date = self.get_message_date(msg)
                progress['exported'] += 1

                # As soon as we're in the next day, update the output the writer
                if msg_date != previous_date:
                    # Exit the previous writer to end the header
                    writer.__exit__(None, None, None)

                    # Update date values and create a new instance
                    previous_date, following_date =\
                        self.get_previous_and_next_day(db, msg_date)

                    writer = HTMLTLWriter(msg_date,
                                          self.media_handler,
                                          previous_date=previous_date,
                                          following_date=following_date)
                    # Call the callback
                    if callback:
                        progress['etl'] = self.calculate_etl(
                            start, progress['exported'], progress['total'])
                        callback(progress)
                    else:
                        print(progress)

                writer.write_message(msg, db)
                # If the message has media, we need to copy it so it's accessible by the exported HTML
                if not isinstance(msg, MessageService) and msg.media:
                    source = db_media_handler.get_msg_media_path(msg)
                    output = self.media_handler.get_msg_media_path(msg)
                    # Source may be None if the media is unsupported (i.e. a webpage)
                    if source and isfile(source):
                        copyfile(source, output)

                previous_date = msg_date

            # Always exit at the end
            writer.__exit__(None, None, None)
            # Call the callback to notify we've finished
            if callback:
                progress['etl'] = timedelta(seconds=0)
                callback(progress)
Пример #9
0
 def __init__(self, backups_dir, name):
     """
     :param backups_dir: The backup directory, stored as given
     :param name:        The export name, used as the subdirectory of Exporter.export_dir
     """
     export_path = path.join(Exporter.export_dir, name)

     self.name = name
     self.backups_dir = backups_dir
     self.output_dir = export_path
     self.media_handler = MediaHandler(export_path)
Пример #10
0
class Exporter:
    """Class used to export database files"""

    # Default output directory for all the exported backups
    export_dir = 'backups/exported'

    def __init__(self, backups_dir, name):
        """Remember where the backup lives and set up the export output paths.

        :param backups_dir: Kept unchanged as ``self.backups_dir``
        :param name:        Kept unchanged as ``self.name``; also joined onto
                            ``Exporter.export_dir`` to build ``self.output_dir``
        """
        self.backups_dir, self.name = backups_dir, name

        # The output directory is "<export_dir>/<name>"
        export_root = Exporter.export_dir
        self.output_dir = path.join(export_root, name)

        # The media handler is constructed with that same output directory
        self.media_handler = MediaHandler(self.output_dir)

    #region Exporting databases

    def export(self, callback=None):
        """Exports the given database with the specified name.
           An optional callback function can be given with one
           dictionary parameter containing progress information
           (saved_msgs, total_msgs, etl)"""

        Thread(target=self.export_thread, kwargs={
            'callback': callback
        }).start()

    def copy_default_media(self):
        """Create the output directory and copy the bundled default resources into it.

        The style sheet goes directly into ``self.output_dir``; the two default
        pictures go wherever ``self.media_handler.get_default_file`` says, after
        ``make_tree`` has been called on the media handler.
        """
        out_dir = self.output_dir
        makedirs(out_dir, exist_ok=True)
        copyfile('exporter/resources/style.css', path.join(out_dir, 'style.css'))

        handler = self.media_handler
        handler.make_tree()

        # (bundled resource, name passed to get_default_file for its destination)
        defaults = (
            ('exporter/resources/default_propic.png', 'propics'),
            ('exporter/resources/default_photo.png', 'photos'),
        )
        for resource, kind in defaults:
            copyfile(resource, handler.get_default_file(kind))

    def export_thread(self, callback):
        """Export the backed up conversation (meant to be run in a different thread).

        Messages are read from the backup database in ascending id order and
        handed to an ``HTMLTLWriter``; a new writer is created every time the
        message date (ignoring the time) changes. Profile photos and message
        media that exist in the backup are copied next to the exported output.

        If the database has no messages there is nothing to write: the default
        media is still copied, the callback (if any) is notified once and the
        method returns.

        :param callback: Optional function called with the ``progress`` dictionary
                         (keys ``'exported'``, ``'total'`` and ``'etl'``) every time
                         a new day starts and once more when the export is done.
                         When it is falsy, the progress is printed on every new
                         day instead.
        """

        with TLDatabase(self.backups_dir) as db:
            db_media_handler = MediaHandler(self.backups_dir)

            # First copy the default media files
            self.copy_default_media()

            progress = {
                'exported': 0,
                'total': db.count('messages'),
                'etl': 'Unknown'
            }

            # The first date will obviously be the first day. Without a first
            # message there is no date to start from (the date helpers would
            # fail on None), so finish early instead of crashing.
            first_message = db.query_message('order by id asc')
            if not first_message:
                if callback:
                    progress['etl'] = timedelta(seconds=0)
                    callback(progress)
                return

            previous_date = self.get_message_date(first_message)

            # Also find the next day
            following_date = self.get_previous_and_next_day(db,
                                                            previous_date)[1]

            # Set the first writer (which will have the "previous" date, the first one)
            writer = HTMLTLWriter(previous_date,
                                  self.media_handler,
                                  following_date=following_date)
            try:
                # Keep track from when we started to determine the estimated time left
                start = datetime.now()

                # Export the profile photos, from users chats and channels
                # TODO This should also have a progress if we have a backup of thousands of files!
                for user in db.query_users():
                    if user.photo:
                        source = db_media_handler.get_propic_path(user)
                        output = self.media_handler.get_propic_path(user)
                        if isfile(source):
                            copyfile(source, output)

                # Iterate over all the messages to export them in their respective days
                for msg in db.query_messages('order by id asc'):
                    msg_date = self.get_message_date(msg)
                    progress['exported'] += 1

                    # As soon as we're in the next day, update the output the writer
                    if msg_date != previous_date:
                        # Exit the previous writer to end the header. It is
                        # forgotten first so the ``finally`` below never exits
                        # the same writer twice if something fails before the
                        # replacement exists.
                        finished, writer = writer, None
                        finished.__exit__(None, None, None)

                        # Update date values and create a new instance
                        previous_date, following_date =\
                            self.get_previous_and_next_day(db, msg_date)

                        writer = HTMLTLWriter(msg_date,
                                              self.media_handler,
                                              previous_date=previous_date,
                                              following_date=following_date)
                        # Call the callback
                        if callback:
                            progress['etl'] = self.calculate_etl(
                                start, progress['exported'], progress['total'])
                            callback(progress)
                        else:
                            print(progress)

                    writer.write_message(msg, db)
                    # If the message has media, we need to copy it so it's accessible by the exported HTML
                    if not isinstance(msg, MessageService) and msg.media:
                        source = db_media_handler.get_msg_media_path(msg)
                        output = self.media_handler.get_msg_media_path(msg)
                        # Source may be None if the media is unsupported (i.e. a webpage)
                        if source and isfile(source):
                            copyfile(source, output)

                    # From now on this is the day we compare the next message against
                    previous_date = msg_date
            finally:
                # Always exit at the end, also when the export failed half-way,
                # so the current writer is not left open
                if writer is not None:
                    writer.__exit__(None, None, None)

            # Call the callback to notify we've finished
            if callback:
                progress['etl'] = timedelta(seconds=0)
                callback(progress)

    #endregion

    #region Utilities

    @staticmethod
    def get_previous_and_next_day(db, message_date):
        """Gets the previous and following saved days given the day in between in the database"""
        previous = db.query_message(
            "where date < '{}' order by id desc".format(message_date))
        following = db.query_message(
            "where date >= '{}' order by id asc".format(message_date +
                                                        timedelta(days=1)))

        return Exporter.get_message_date(previous), Exporter.get_message_date(
            following)

    @staticmethod
    def calculate_etl(start, saved, total):
        """Calculates the estimated time left, based on how long it took us
           to reach "saved" and how many messages we have left"""
        delta_time = (datetime.now() - start).total_seconds() / saved
        left = total - saved
        return timedelta(seconds=round(left * delta_time, 1))

    @staticmethod
    def get_message_date(message):
        """Retrieves the given message DATE, ignoring the time (hour, minutes, seconds, etc.)"""
        if message:
            return date(year=message.date.year,
                        month=message.date.month,
                        day=message.date.day)