def entry_from_file_path(file_path: Path, source: BaseSource) -> Entry:
    """
    Create an Entry template from a file path, filling the fields with file
    metadata (checksum, resolved path, mimetype). When the mimetype is known,
    also set a media-specific schema and attach media metadata.

    :param file_path: path of the file to describe.
    :param source: source whose ``entry_source`` is stored on the new Entry.
    :return: an unsaved Entry populated from the file's metadata.
    """
    mimetype = get_mimetype(file_path)
    entry = Entry(
        title=file_path.name,
        source=source.entry_source,
        schema=get_schema_from_mimetype(mimetype),
        extra_attributes={
            'file': {
                'checksum': get_checksum(file_path),
                'path': str(file_path.resolve()),
                'mimetype': mimetype,
            },
        },
    )
    entry.date_on_timeline = get_file_entry_date(entry)

    if mimetype:
        # The prefixes are mutually exclusive, so use an elif chain instead of
        # re-testing every prefix after one has already matched.
        if mimetype.startswith('image/'):
            entry.schema = 'file.image'
            entry.extra_attributes.update(get_image_extra_attributes(file_path))
        elif mimetype.startswith('video/'):
            entry.schema = 'file.video'
            try:
                entry.extra_attributes.update(get_video_extra_attributes(file_path))
            except FileFormatError:
                # Best-effort: keep the entry even if the video metadata is unreadable.
                logger.exception(f"Could not read metadata for video {str(file_path)}")
        elif mimetype.startswith('audio/'):
            entry.schema = 'file.audio'
            entry.extra_attributes.update(get_audio_extra_attributes(file_path))
        elif mimetype.startswith('text/'):
            entry.schema = 'file.text'
            with file_path.open('r') as text_file:
                # Read only a bounded preview so huge files are not loaded whole.
                entry.description = text_file.read(settings.MAX_PLAINTEXT_PREVIEW_SIZE)

    return entry
elif mimetype and mimetype.startswith('video'): entry.schema = 'message.telegram.video' elif mimetype and mimetype.startswith('image'): entry.schema = 'message.telegram.image' else: entry = Entry() if message.get('media_type') == 'sticker': entry.schema = 'message.telegram.sticker' elif message.get('media_type') == 'animation': entry.schema = 'message.telegram.gif' else: entry.schema = 'message.telegram' entry.source = self.entry_source entry.description = self.get_message_text(message) entry.date_on_timeline = self.get_message_date(message) # Set message metadata if chat['type'] == 'personal_chat': # For personal chats, messages are from one user to another user. # In the telegram data, the chat ID is the same as the other user's ID. if message['from_id'] == self.account_id( account): # Outgoing private msg entry.extra_attributes.update({ 'sender_name': self.account_name(account), 'sender_id': message['from_id'], 'recipient_name': chat['name'], 'recipient_id':
def create_entries_from_directory(path: Path, source: BaseSource, backup_date: datetime, use_cache=True) -> List[Entry]:
    """
    Delete and recreate the Entries for the files in a directory.

    :param path: directory whose files become Entries (filtered by timelineinclude rules).
    :param source: source whose existing entries are deleted and replaced.
    :param backup_date: stored on each new entry as ``backup_date``.
    :param use_cache: when True, reuse metadata from existing entries for files
        whose checksum is unchanged, instead of recomputing it.
    :return: the newly created entries.
    """
    timelineinclude_rules = list(get_include_rules_for_dir(path, settings.TIMELINE_INCLUDE_FILE))
    files = list(get_files_matching_rules(get_files_in_dir(path), timelineinclude_rules))

    inode_checksum_cache = {}  # translates file inodes to checksums
    metadata_cache = {}  # translates checksums to entry metadata
    cached_extra_attributes = ('location', 'media', 'previews')

    if use_cache:
        # Most files in a directory already have a matching Entry. Recalculating the metadata for each file Entry
        # is wasteful and time-consuming.
        # Instead, we build a cache of all files that have an Entry. If we process a file that already has an
        # Entry (if they have the same inode), we can reuse the cached Entry metadata.
        for entry in source.get_entries():
            try:
                # We also avoid calculating checksums if we don't have to. Instead, we compare the file inodes. If
                # the inodes are the same, THEN we calculate and compare the checksums. If the file in the Entry
                # and the file in the directory have the same checksum, then they're identical, and we can reuse
                # the metadata.
                entry_file_inode = Path(entry.extra_attributes['file']['path']).stat().st_ino
                inode_checksum_cache[entry_file_inode] = entry.extra_attributes['file']['checksum']
            except FileNotFoundError:
                # This can happen if the file in the Entry was deleted or moved.
                pass

            # The checksum-keyed metadata is cached even if the entry's file is
            # gone — a moved file with the same content can still reuse it.
            metadata = {}
            for attribute in cached_extra_attributes:
                if attribute in entry.extra_attributes:
                    metadata[attribute] = entry.extra_attributes[attribute]
            if entry.description:
                metadata['description'] = entry.description
            metadata_cache[entry.extra_attributes['file']['checksum']] = metadata

    entries_to_create = []
    for file in files:
        # Bug fix: Path.resolve() returns a new Path and mutates nothing; the
        # original discarded the result, so the cached branch below stored an
        # unresolved path (unlike entry_from_file_path, which resolves it).
        file = file.resolve()
        try:
            checksum = inode_checksum_cache.get(file.stat().st_ino) or get_checksum(file)
        except OSError:
            logger.exception(f"Could not generate checksum for {str(file)}")
            raise

        if checksum in metadata_cache:
            # Identical content already had an Entry — reuse its cached metadata
            # instead of re-extracting it from the file.
            mimetype = get_mimetype(file)
            entry = Entry(
                title=file.name,
                source=source.entry_source,
                schema=get_schema_from_mimetype(mimetype),
                description=metadata_cache[checksum].get('description', ''),
                extra_attributes={
                    'file': {
                        'path': str(file),
                        'checksum': checksum,
                        'mimetype': mimetype,
                    },
                },
            )
            for attribute in cached_extra_attributes:
                if attribute in metadata_cache[checksum]:
                    entry.extra_attributes[attribute] = metadata_cache[checksum][attribute]
        else:
            entry = entry_from_file_path(file, source)

        entry.extra_attributes['backup_date'] = datetime_to_json(backup_date)
        entry.date_on_timeline = get_file_entry_date(entry)  # This could change, so it's not cached
        entries_to_create.append(entry)

    source.get_entries().delete()  # TODO: Only delete the entries in the specified directory?
    return Entry.objects.bulk_create(entries_to_create)