def compute(file_obj):
    """Fingerprint the MPEG frame data of an MP3 file with SHA-1.

    The stream is broken up by mp3_frame.split(). Only chunks that come
    back paired with a header are fed into the hash; chunks whose header
    is None are ignored, so they do not influence the fingerprint.

    Args:
      file_obj: A file-like object

    Returns:
      The fingerprint as a string of 40 hex digits, or None if
      mp3_frame.split() never produced a chunk with a header.
    """
    digest = hashlib.sha1()
    found_frame = False
    for hdr, chunk in mp3_frame.split(file_obj):
        if hdr is None:
            # Not a frame; leave it out of the fingerprint.
            continue
        digest.update(chunk)
        found_frame = True

    if not found_frame:
        return None
    return digest.hexdigest()
def analyze(file_obj, au_file, compute_fingerprint=True, get_payload=True):
    """Scan an MP3 stream and record what was found on an AudioFile.

    The stream is broken up by mp3_frame.split(); every chunk that comes
    back with a header is counted as a frame. The first header seen is
    turned into a template, and each later header must match() it.

    Args:
      file_obj: A file-like object.
      au_file: An AudioFile object to store the results of the analysis in.
      compute_fingerprint: If False, do not compute a fingerprint;
        au_file.fingerprint is not assigned.
      get_payload: If False, do not collect the frame data;
        au_file.payload is not assigned.

    Returns:
      The same AudioFile object that was passed in as au_file, with
      frame_count, frame_size, duration_ms and mp3_header set, plus
      fingerprint and payload when they were requested.

    Raises:
      InvalidFileError: if a header fails to match the first header seen,
        or if fewer than _MINIMUM_FRAMES frames were found.
    """
    au_file.frame_count = 0
    au_file.frame_size = 0
    au_file.duration_ms = 0

    # Both accumulators are always created, but each one is only fed (and
    # only read back) when its corresponding flag is set.
    digest = hashlib.sha1()
    frame_bytes = cStringIO.StringIO()

    template_hdr = None
    initial_bit_rate_kbps = None
    total_bit_rate_kbps = 0
    bit_rate_varies = False

    for hdr, chunk in mp3_frame.split(file_obj):
        if hdr is None:
            continue

        # The totals live on au_file itself, so they already include the
        # current frame if an error is raised further down.
        au_file.frame_count += 1
        au_file.frame_size += len(chunk)
        au_file.duration_ms += hdr.duration_ms
        if compute_fingerprint:
            digest.update(chunk)
        if get_payload:
            frame_bytes.write(chunk)

        # Every frame after the first is checked against the template.
        if template_hdr:
            if not hdr.match(template_hdr):
                raise InvalidFileError(
                    "Bad header: found %s, expected %s (path=%s)"
                    % (hdr, template_hdr, au_file.path))
            # Any departure from the first frame's bit rate marks the
            # file as variable bit-rate.
            if hdr.bit_rate_kbps != initial_bit_rate_kbps:
                bit_rate_varies = True

        # This has to run before the template is set up below: for the
        # first frame, hdr and the template are the same object, and its
        # bit rate is about to be blanked.
        total_bit_rate_kbps += hdr.bit_rate_kbps

        if template_hdr is None:
            # The first header itself (not a copy) becomes the template.
            # Fields that may legitimately differ from frame to frame are
            # cleared to None.
            template_hdr = hdr
            initial_bit_rate_kbps = template_hdr.bit_rate_kbps
            template_hdr.bit_rate_kbps = None  # Varies in VBR files.
            template_hdr.padding = None  # Only some frames are padded.
            template_hdr.frame_size = None
            # One might expect this to be constant, but real-world MP3s
            # show that it is not.
            template_hdr.protected = None

    if au_file.frame_count < _MINIMUM_FRAMES:
        raise InvalidFileError(
            "Found only %d MPEG frames" % au_file.frame_count)

    # Restore a bit rate on the template: the per-frame average for a
    # variable bit-rate file, otherwise the first frame's rate.
    if bit_rate_varies:
        template_hdr.bit_rate_kbps = (
            float(total_bit_rate_kbps) / au_file.frame_count)
    else:
        template_hdr.bit_rate_kbps = initial_bit_rate_kbps
    au_file.mp3_header = template_hdr

    # Truncate the accumulated duration to an integer.
    au_file.duration_ms = int(au_file.duration_ms)

    if compute_fingerprint:
        au_file.fingerprint = digest.hexdigest()
    if get_payload:
        au_file.payload = frame_bytes.getvalue()
    return au_file
def test_split(self):
    """Check the three mp3_frame splitters against known chunk sequences.

    Each case is a list of chunks. The chunks are fed to mp3_frame.split()
    (joined, via a stream), mp3_frame.split_blocks() (as an iterator of
    chunks) and mp3_frame.split_one_block() (joined, as one string).
    split() must return exactly one (header, data) pair per chunk, with
    a matching header for whole frames and None for everything else; the
    other two splitters must then agree with split().
    """
    raw_hdr, hdr = mp3_header_test.VALID_MP3_HEADERS.items()[0]
    # One complete frame: the raw header padded out to the frame size.
    frame_data = raw_hdr.ljust(hdr.frame_size, "a")

    # Truncated pieces: the start of a header, and the start of a frame.
    partial_header = raw_hdr[:3]
    short_frame = frame_data[:25]
    assert len(short_frame) < len(frame_data)

    # NOTE(review): create_test_header(n) presumably builds an ID3 header
    # declaring an n-byte tag -- confirm against id3_header.
    id3_data = id3_header.create_test_header(77).ljust(77, "b")

    # An ID3 tag that has the bytes of a valid frame header inside it.
    evil_id3_data = id3_header.create_test_header(50) + raw_hdr
    evil_id3_data = evil_id3_data.ljust(50, "c")

    cases = (
        [frame_data],
        [frame_data, frame_data],
        ['junk', frame_data],
        ['junk', frame_data, frame_data],
        ['junk', frame_data, frame_data, 'junk'],
        ['junk', frame_data, frame_data, 'junk', frame_data],
        # Truncated headers and frames.
        [partial_header],
        ['junk', partial_header],
        ['junk', short_frame],
        [frame_data, partial_header],
        [frame_data, short_frame],
        [frame_data, 'junk', short_frame],
        [frame_data, 'junk', partial_header],
        # ID3 headers mixed in.
        [id3_data, frame_data],
        [frame_data, id3_data],
        [id3_data, frame_data],
        [id3_data, frame_data, id3_data],
        [evil_id3_data, frame_data, "junk"],
        ["junk", frame_data, evil_id3_data, frame_data],
        ["junk", frame_data, evil_id3_data, frame_data, "junk"],
        ["junk" + evil_id3_data, id3_data, frame_data, evil_id3_data],
        # Some longer sequences.
        500 * [frame_data],
        500 * ["junk", frame_data, id3_data, frame_data],
    )

    for seq in cases:
        joined = ''.join(seq)
        via_stream = list(mp3_frame.split(cStringIO.StringIO(joined)))
        via_blocks = list(mp3_frame.split_blocks(iter(seq)))
        via_one_block = mp3_frame.split_one_block(joined)

        # split() must hand back the chunks exactly as they went in.
        self.assertEqual(len(seq), len(via_stream))
        for expected_chunk, (actual_hdr, chunk) in zip(seq, via_stream):
            self.assertEqual(expected_chunk, chunk)
            if expected_chunk == frame_data:
                self.assertTrue(actual_hdr is not None)
                self.assertTrue(actual_hdr.match(hdr))
                self.assertEqual(hdr.frame_size, len(frame_data))
            else:
                self.assertTrue(actual_hdr is None)

        # split_blocks() and split_one_block() must each agree with
        # split(), pair for pair. Headers are compared by their str().
        for alternate in (via_blocks, via_one_block):
            self.assertEqual(len(seq), len(alternate))
            for (hdr1, data1), (hdr2, data2) in zip(via_stream, alternate):
                self.assertEqual(str(hdr1), str(hdr2))
                self.assertEqual(data1, data2)
def analyze(file_obj, au_file, compute_fingerprint=True, get_payload=True):
    """Scan an MP3 stream and record what was found on an AudioFile.

    The stream is broken up by mp3_frame.split(); every chunk that comes
    back with a header is counted as a frame. The first header seen is
    turned into a template, and each later header must match() it.

    Args:
      file_obj: A file-like object.
      au_file: An AudioFile object to store the results of the analysis in.
      compute_fingerprint: If False, do not compute a fingerprint;
        au_file.fingerprint is not assigned.
      get_payload: If False, do not collect the frame data;
        au_file.payload is not assigned.

    Returns:
      The same AudioFile object that was passed in as au_file, with
      frame_count, frame_size, duration_ms and mp3_header set, plus
      fingerprint and payload when they were requested.

    Raises:
      InvalidFileError: if a header fails to match the first header seen,
        or if fewer than _MINIMUM_FRAMES frames were found.
    """
    au_file.frame_count = 0
    au_file.frame_size = 0
    au_file.duration_ms = 0

    # Both accumulators are always created, but each one is only fed (and
    # only read back) when its corresponding flag is set.
    digest = hashlib.sha1()
    frame_bytes = cStringIO.StringIO()

    template_hdr = None
    initial_bit_rate_kbps = None
    total_bit_rate_kbps = 0
    bit_rate_varies = False

    for hdr, chunk in mp3_frame.split(file_obj):
        if hdr is None:
            continue

        # The totals live on au_file itself, so they already include the
        # current frame if an error is raised further down.
        au_file.frame_count += 1
        au_file.frame_size += len(chunk)
        au_file.duration_ms += hdr.duration_ms
        if compute_fingerprint:
            digest.update(chunk)
        if get_payload:
            frame_bytes.write(chunk)

        # Every frame after the first is checked against the template.
        if template_hdr:
            if not hdr.match(template_hdr):
                raise InvalidFileError(
                    "Bad header: found %s, expected %s (path=%s)"
                    % (hdr, template_hdr, au_file.path))
            # Any departure from the first frame's bit rate marks the
            # file as variable bit-rate.
            if hdr.bit_rate_kbps != initial_bit_rate_kbps:
                bit_rate_varies = True

        # This has to run before the template is set up below: for the
        # first frame, hdr and the template are the same object, and its
        # bit rate is about to be blanked.
        total_bit_rate_kbps += hdr.bit_rate_kbps

        if template_hdr is None:
            # The first header itself (not a copy) becomes the template.
            # Fields that may legitimately differ from frame to frame are
            # cleared to None.
            template_hdr = hdr
            initial_bit_rate_kbps = template_hdr.bit_rate_kbps
            template_hdr.bit_rate_kbps = None  # Varies in VBR files.
            template_hdr.padding = None  # Only some frames are padded.
            template_hdr.frame_size = None
            # One might expect this to be constant, but real-world MP3s
            # show that it is not.
            template_hdr.protected = None

    if au_file.frame_count < _MINIMUM_FRAMES:
        raise InvalidFileError(
            "Found only %d MPEG frames" % au_file.frame_count)

    # Restore a bit rate on the template: the per-frame average for a
    # variable bit-rate file, otherwise the first frame's rate.
    if bit_rate_varies:
        template_hdr.bit_rate_kbps = (
            float(total_bit_rate_kbps) / au_file.frame_count)
    else:
        template_hdr.bit_rate_kbps = initial_bit_rate_kbps
    au_file.mp3_header = template_hdr

    # Truncate the accumulated duration to an integer.
    au_file.duration_ms = int(au_file.duration_ms)

    if compute_fingerprint:
        au_file.fingerprint = digest.hexdigest()
    if get_payload:
        au_file.payload = frame_bytes.getvalue()
    return au_file