def check_with_matches(filename, matches):
    """Check a WebM candidate for data located after its segment.

    The end of the WebM data is taken to be the reader position right after
    the segment size field, plus the segment size read from that field.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsWEBM' rule
    :return: ``None`` if the 'IsWEBM' rule did not match, otherwise a
        PolyglotLevel with one chunk covering any bytes past the segment
    """
    if 'IsWEBM' not in matches:
        return None

    level = PolyglotLevel()
    reader = WebmReader(filename)
    try:
        reader.read_ebml_header()
        reader.read_segment_id()
        segment_size = reader.read_size()
        webm_size = segment_size + reader.file.tell()
    finally:
        # Always release the reader, even when one of the reads above raises
        reader.close()

    file_size = os.stat(filename).st_size
    if file_size > webm_size:
        level.add_chunk(webm_size, file_size - webm_size)
    return level
def check_with_matches(filename, matches):
    """Inspect a PDF candidate for data before its magic or after its EOF.

    Match entries in ``.strings`` are indexed here with ``[0]`` as the
    offset and ``[2]`` as the matched bytes.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, keyed by rule name
    :return: ``None`` if the 'IsPDF' rule did not match, otherwise a
        PolyglotLevel describing the suspicious regions
    """
    if 'IsPDF' not in matches:
        return None

    level = PolyglotLevel()

    if 'HasTruncatedMagic' in matches:
        truncated_magic_offset = matches['HasTruncatedMagic'].strings[0][0]
        if 'HasMagic' in matches:
            magic_offset = matches['HasMagic'].strings[0][0]
        else:
            magic_offset = None

        # The file is only considered valid when the full magic sits exactly
        # on the first truncated magic, within the first 1024 bytes
        if magic_offset != truncated_magic_offset or truncated_magic_offset > 1024:
            level.invalid()

        # Anything before the first magic is reported as a chunk
        if truncated_magic_offset > 0:
            level.add_chunk(0, truncated_magic_offset)

    last_eof = None
    if 'HasEOF' in matches:
        last_eof = matches['HasEOF'].strings[-1]

    file_size = os.stat(filename).st_size
    if last_eof is None:
        return level

    # End of the PDF: offset of the last EOF match plus the matched length
    pdf_end = last_eof[0] + len(last_eof[2])
    if pdf_end < file_size:
        level.add_chunk(pdf_end, file_size - pdf_end)
    return level
def check_with_matches(filename, matches):
    """Check a BMP candidate for data appended after the declared image size.

    The declared size is the little-endian unsigned 32-bit integer stored at
    byte offset 2 of the file.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsBMP' rule
    :return: ``None`` if the 'IsBMP' rule did not match or if opening the
        image raises ``SyntaxError``; otherwise a PolyglotLevel with one
        chunk covering any bytes past the declared size
    """
    if 'IsBMP' not in matches:
        return None

    try:
        with BmpImageFile(filename) as image:
            level = PolyglotLevel()

            image.fp.seek(2)
            image_size = unpack('<I', image.fp.read(4))[0]

            image.fp.seek(0, io.SEEK_END)
            file_size = image.fp.tell()

            # Only report data that really exists past the declared size; a
            # file shorter than its declared size has no trailing chunk (and
            # would otherwise yield a negative length)
            if file_size > image_size:
                level.add_chunk(image_size, file_size - image_size)
            return level
    except SyntaxError:
        return None
def check_with_matches(filename, matches):
    """Check a GIF candidate for data located after the last frame.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsGIF' rule
    :return: ``None`` if the 'IsGIF' rule did not match or if reading the
        image raises ``SyntaxError``; otherwise a PolyglotLevel with one
        chunk covering any bytes past the end of the image
    """
    if 'IsGIF' not in matches:
        return None

    try:
        with GifImageFile(filename) as image:
            last_frame = image.n_frames - 1
            image.seek(last_frame)
            # Drain the data blocks of the last frame
            while image.data():
                pass

            level = PolyglotLevel()
            stream = image.fp
            data_end = stream.tell()
            # A ';' byte right after the last frame belongs to the image
            trailer = stream.read(1)
            gif_end = data_end + 1 if trailer == b';' else data_end

            stream.seek(0, io.SEEK_END)
            total_size = stream.tell()
            if gif_end != total_size:
                level.add_chunk(gif_end, total_size - gif_end)
            return level
    except SyntaxError:
        return None
def check_with_matches(filename, matches):
    """Check a PNG candidate for data located after its end section.

    Sections are walked with ``read_section`` until the one named
    ``_PNG_END_SECTION``; each section body and its CRC are skipped by
    seeking.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsPNG' rule
    :return: ``None`` if the 'IsPNG' rule did not match, if the file does not
        start with ``_MAGIC``, or if walking the sections raises
        ``SyntaxError``; otherwise a PolyglotLevel with one chunk covering
        any bytes past the end section
    """
    if 'IsPNG' not in matches:
        return None

    with open(filename, 'rb') as file:
        if file.read(len(_MAGIC)) != _MAGIC:
            return None

        try:
            name = None
            while name != _PNG_END_SECTION:
                name, length = read_section(file)
                file.seek(length + _CRC_SIZE, io.SEEK_CUR)
            png_end = file.tell()

            file.seek(0, io.SEEK_END)
            file_size = file.tell()

            level = PolyglotLevel()
            # Seeking may move past the end of a truncated file, so only
            # report a chunk when bytes really exist after the end section
            # (avoids a negative-length chunk)
            if png_end < file_size:
                level.add_chunk(png_end, file_size - png_end)
            return level
        except SyntaxError:
            return None
def check_with_matches(filename, matches):
    """Check a JPG candidate for data located after its end marker.

    Sections are walked with ``read_section`` until the start-of-scan
    section; the image end is then the first 'HasEndMarker' match located
    after that point.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, keyed by rule name
    :return: ``None`` if the 'IsJPG' rule did not match or if mapping the
        file raises ``ValueError``; the result of ``level.invalid()`` if
        walking the sections fails; otherwise a PolyglotLevel with one chunk
        covering any bytes past the end marker
    """
    if 'IsJPG' not in matches:
        return None

    with open(filename, 'rb') as file:
        try:
            with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as buf:
                level = PolyglotLevel()
                buf.seek(len(__JPG_MAGIC))
                try:
                    section = b''
                    while section != __JPG_START_OF_SCAN:
                        section, length = read_section(buf)
                        buf.seek(length - 2, io.SEEK_CUR)
                except (ValueError, SyntaxError):
                    return level.invalid()
                scan_offset = buf.tell()

                # Without a 'HasEndMarker' match there is nothing to search;
                # fall back to an empty sequence instead of iterating None
                if 'HasEndMarker' in matches:
                    end_marker_matches = matches['HasEndMarker'].strings
                else:
                    end_marker_matches = ()

                # First end marker located after the start of scan, if any
                end_marker_offset = next(
                    (m[0] for m in end_marker_matches if m[0] > scan_offset),
                    None)

                if end_marker_offset is not None:
                    end_offset = end_marker_offset + len(__JPG_END_MARKER)
                    if end_offset < buf.size():
                        level.add_chunk(end_offset, buf.size() - end_offset)
                return level
        except ValueError:
            return None
def check_with_matches(filename: str, matches):
    """Check an AVI candidate for data before its header or after its end.

    The end of the AVI data is the header offset plus the size that
    ``__get_size`` extracts from the matched header bytes.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, keyed by rule name
    :return: ``None`` if the 'AVIHeader' rule did not match, otherwise a
        PolyglotLevel describing the suspicious regions
    """
    if 'AVIHeader' not in matches:
        return None

    level = PolyglotLevel()

    header_offset = matches['AVIHeader'].strings[0][0]
    if header_offset > 0:
        # Bytes located before the header
        level.add_chunk(0, header_offset)

    header_data = matches['AVIHeader'].strings[0][2]
    avi_end = header_offset + __get_size(header_data)

    total_size = os.stat(filename).st_size
    if avi_end < total_size:
        # Bytes located after the declared end
        level.add_chunk(avi_end, total_size - avi_end)
    return level
def check_with_matches(filename: str, matches):
    """Check an MP3 candidate for data outside its chain of frames.

    Frame headers accepted by ``__is_good`` are walked in list order. For
    each frame its size is computed from the third header byte, and any gap
    before the next header (or before the end of the file) is reported.

    The walk assumes the 'MP3Header' matches are ordered by increasing
    offset -- TODO confirm against the rule engine's ordering guarantees.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, keyed by rule name
    :return: ``None`` if the 'MP3Header' rule did not match or if fewer than
        50 headers are accepted, otherwise a PolyglotLevel describing the
        suspicious regions
    """
    if 'MP3Header' not in matches:
        return None

    strings = [s for s in matches['MP3Header'].strings if __is_good(s)]
    # Heuristic to reduce the number of false positives: require at least
    # 50 accepted frame headers (this also covers the empty case)
    if len(strings) < 50:
        return None

    begin = 0
    if 'HasID3' in matches:
        # Presumably the 10-byte ID3 header plus the synchsafe-encoded size
        # stored from byte 6 of the matched data -- verify against the rule
        size = __synchsafe(bytes(matches['HasID3'].strings[0][2][6:]))
        begin = 10 + size

    level = PolyglotLevel()
    first_mp3_header_offset = matches['MP3Header'].strings[0][0]
    if first_mp3_header_offset > begin:
        # NOTE(review): the chunk starts at 0, so it includes the ID3 data
        # when present -- confirm this is intended
        level.add_chunk(0, first_mp3_header_offset)

    header_count = len(strings)
    idx = 0
    while idx < header_count:
        string = strings[idx]
        third_byte = int(string[2][2])
        # Bits 4-7: bitrate index, bits 2-3: sampling index, bit 1: padding
        bitrate = __bitrate_conversion[(third_byte & 0xF0) >> 4] * 1000
        sampling_frequency = __sampling_conversion[(third_byte & 0x0C) >> 2]
        padding = (third_byte & 0x02) >> 1
        # Source for computation : https://www.researchgate.net/publication/225793510_A_study_on_multimedia_file_carving_method, page 8
        unit_size = math.floor(144 * bitrate / sampling_frequency) + padding
        frame_end = string[0] + unit_size

        # Scan forward for the first header at or after the end of this
        # frame. Starting at idx + 1 keeps the walk linear overall and
        # guarantees progress even for a non-positive frame size.
        next_idx = idx + 1
        while next_idx < header_count and strings[next_idx][0] < frame_end:
            next_idx += 1

        if next_idx == header_count:
            # Last frame: report only bytes that really exist after it
            file_size = os.stat(filename).st_size
            if file_size > frame_end:
                level.add_chunk(frame_end, file_size - frame_end)
            break

        next_offset = strings[next_idx][0]
        if next_offset != frame_end:
            # Gap between the end of this frame and the next header
            level.add_chunk(frame_end, next_offset - frame_end)
        idx = next_idx
    return level
def check_with_matches(filename: str, matches):
    """Check an OGG candidate for data outside its pages.

    Every 'OGGHeader' match is treated as the start of a page whose size is
    given by ``__get_page_size``. Bytes before the first page, between the
    end of a page and the next header, and after the last page are reported.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, keyed by rule name
    :return: ``None`` if the 'OGGHeader' rule did not match, otherwise a
        PolyglotLevel describing the suspicious regions
    """
    if 'OGGHeader' not in matches:
        return None

    level = PolyglotLevel()
    headers = matches['OGGHeader'].strings

    begin_offset = headers[0][0]
    if begin_offset > 0:
        level.add_chunk(0, begin_offset)

    last_idx = len(headers) - 1
    for string_idx, string in enumerate(headers):
        page_end = string[0] + __get_page_size(filename, string)
        if string_idx < last_idx:
            # The page is bounded by the next header ...
            boundary = headers[string_idx + 1][0]
        else:
            # ... or, for the last page, by the end of the file
            boundary = os.stat(filename).st_size
        # Same test for both cases: only report bytes that really exist
        # past the page, so a truncated last page yields no negative chunk
        if boundary > page_end:
            level.add_chunk(page_end, boundary - page_end)
    return level
def check_with_matches(filename, matches):
    """Check a RAR candidate for data before its magic or after its end.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsRAR' rule
    :return: ``None`` if the 'IsRAR' rule did not match or if parsing raises
        ``SyntaxError``; otherwise a PolyglotLevel whose validity comes from
        the parsed file and which describes the suspicious regions
    """
    if 'IsRAR' not in matches:
        return None

    try:
        with _RARFile(filename) as rar_file:
            level = PolyglotLevel(is_valid=rar_file.is_valid)

            magic_offset = rar_file.magic_offset
            if magic_offset != 0:
                # Bytes located before the magic
                level.add_chunk(0, magic_offset)

            # Where the parser stopped versus the total buffer size
            parsed_end = rar_file.buf.tell()
            total_size = rar_file.buf.size()
            if parsed_end != total_size:
                level.add_chunk(parsed_end, total_size - parsed_end)
            return level
    except SyntaxError:
        return None
def check_with_matches(filename, matches):
    """
    Check whether the file is a TIFF and, if so, whether it may also contain
    other formats.

    WARNING: The detection of unused garbage at the end of the file is not
    perfect. It only checks whether the last unread zone reaches the end of
    the file, so an attacker could easily craft a TIFF with a tag whose
    offset points at the end of the file.

    :param filename: Path to the file
    :param matches: Container of rule matches, tested for the 'IsTIFF' rule
    :return: A PolyglotLevel, or None if the file is not a TIFF
    """
    if 'IsTIFF' not in matches:
        return None

    try:
        with _TIFFFile(filename) as image:
            level = PolyglotLevel()
            # FIXME Add the other unread zones once the parser reads image data
            # For now only a zone ending exactly at the end of the file is kept
            for zone in image.buf.get_not_read_zones():
                zone_end = zone[0] + zone[1]
                if zone_end != image.buf.size():
                    continue
                level.suspicious_chunks.append(zone)
            return level
    except SyntaxError:
        return None
def check_with_matches(filename: str, matches):
    """Check an HTML candidate for non-whitespace data around the document.

    The document start is the position of ``__DOCTYPE`` when found, otherwise
    the position of the first tag found among ``<html>``, ``<body>`` and
    ``<script>`` (tried in that order). The document end is just after the
    matching closing tag when there is one, otherwise just after the doctype
    or, failing that, just after the opening tag.

    :param filename: Path to the file being analysed
    :param matches: Container of rule matches, tested for the 'IsHTML' rule
    :return: ``None`` if the 'IsHTML' rule did not match or if no document
        start was found; otherwise a PolyglotLevel with chunks for content
        before/after the document that ``__is_whitespace`` rejects
    """
    if 'IsHTML' not in matches:
        return None
    with open(filename, 'rb') as file, \
            mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as buf:
        # -1 means "not found yet" for both bounds
        doc_start = -1
        doc_end = -1
        doctype_pos = buf.find(__DOCTYPE)
        if doctype_pos != -1:
            doc_start = doctype_pos
            doc_end = doc_start + len(__DOCTYPE)
        tags = [b'html', b'body', b'script']
        for tag in tags:
            tag_pos = buf.find(b'<' + tag + b'>')
            if tag_pos != -1:
                # Keep the doctype position as the start when there is one
                if doc_start == -1:
                    doc_start = tag_pos
                end_tag_pos = buf.find(b'</' + tag + b'>')
                if end_tag_pos != -1:
                    # + 3 accounts for the '</' and '>' around the tag name
                    doc_end = end_tag_pos + len(tag) + 3
                elif doc_end == -1:
                    # No closing tag and no doctype: end right after the
                    # opening tag (+ 2 for '<' and '>')
                    doc_end = tag_pos + len(tag) + 2
                # Only the first tag found is used
                break
        if doc_start != -1:
            level = PolyglotLevel()
            buf.seek(0, io.SEEK_SET)
            begin_content = buf.read(doc_start) # Read until doc start
            if not __is_whitespace(begin_content):
                level.add_chunk(0, doc_start)
            buf.seek(doc_end)
            # Everything from the document end to the end of the file
            contents = buf.read()
            if not __is_whitespace(contents):
                level.add_chunk(doc_end, len(contents))
            return level
        else:
            return None
def check_with_matches(filename, matches):
    """Check a ZIP candidate for data before its first magic or after its EOCD.

    Match entries in ``.strings`` are indexed here with ``[0]`` as the
    offset and ``[1]`` as the string identifier.

    :param filename: Path to the file being analysed
    :param matches: Mapping of rule name to match object
    :return: ``None`` if the 'IsZIP' rule did not match, otherwise a
        PolyglotLevel describing the suspicious regions and flagged with the
        embedded formats ('docx', 'jar', 'apk') whose rules matched
    """
    zip_match = matches.get('IsZIP')
    if zip_match is None:
        return None

    level = PolyglotLevel()
    total_size = os.stat(filename).st_size

    # First '$EOCD_magic' entry, in the order reported by the match
    eocd_entries = [
        entry for entry in zip_match.strings if entry[1] == '$EOCD_magic'
    ]
    eocd_offset = eocd_entries[0][0]

    if 'HasZIPMagic' in matches:
        magic_offsets = sorted(
            entry[0] for entry in matches['HasZIPMagic'].strings)
        lowest_magic_offset = magic_offsets[0]
        if lowest_magic_offset != 0:
            # Bytes located before the first ZIP magic
            level.add_chunk(0, lowest_magic_offset)

    # TODO Take comment in account ? Mark as less suspicious ?
    zip_end = eocd_offset + __EOCD_MIN_SIZE
    if zip_end < total_size:
        level.add_chunk(zip_end, total_size - zip_end)

    for rule_name, embedded in (('IsDOCX', 'docx'),
                                ('IsJAR', 'jar'),
                                ('IsAPK', 'apk')):
        if rule_name in matches:
            level.embed(embedded)
    return level
def check_with_matches(filename, matches):
    """Report a PHP candidate whenever the 'HasPHPOpen' rule matched.

    :param filename: Path to the file being analysed (not used here)
    :param matches: Container of rule matches, tested for 'HasPHPOpen'
    :return: A fresh PolyglotLevel if the rule matched, ``None`` otherwise
    """
    return PolyglotLevel() if 'HasPHPOpen' in matches else None
def check_with_matches(filename, matches):
    """Report an ELF candidate whenever the 'IsELF' rule matched.

    :param filename: Path to the file being analysed (not used here)
    :param matches: Container of rule matches, tested for 'IsELF'
    :return: A fresh PolyglotLevel if the rule matched, ``None`` otherwise
    """
    return PolyglotLevel() if 'IsELF' in matches else None