Example #1
	def train_chunks(self, pts, chunk_data, ref_freq, metric='pcd'):
		"""-------------------------------------------------------------------------
		Gets the pitch track chunks of a recording, generates their pitch
		distributions and returns the PitchDistribution objects as a list. This
		function is called for each of the recordings in the training set. The
		outputs of this function are combined in train() and the resultant mode
		model is obtained.
		----------------------------------------------------------------------------
		pts        : List of pitch tracks of chunks that belong to the same mode.
		             The pitch distributions of these are iteratively generated to
		             be used as the sample points of the mode model.
		chunk_data : The relevant data about the chunks; source, initial timestamp
		             and final timestamp. The format is the same as slice() of
		             ModeFunctions.
		ref_freq   : Reference frequency to be used in PD/PCD generation. Since this
		             is the training function, this should be the annotated tonic of
		             the recording.
		metric     : The choice of PCD or PD
		-------------------------------------------------------------------------"""
		dist_list = []
		# Iterates over the pitch tracks of a recording
		for idx in range(len(pts)):
			# Retrieves the relevant information about the current chunk
			src = chunk_data[idx][0]
			interval = (chunk_data[idx][1], chunk_data[idx][2])
			# PitchDistribution of the current chunk is generated
			dist = mf.generate_pd(pts[idx], ref_freq=ref_freq, smooth_factor=self.smooth_factor,
			                      step_size=self.step_size, source=src, segment=interval, overlap=self.overlap)
			if metric == 'pcd':
				dist = mf.generate_pcd(dist)
			# The resultant pitch distributions are filled in the list to be returned
			dist_list.append(dist)
		return dist_list
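
For orientation, the chunk distributions built here by mf.generate_pd / mf.generate_pcd can be pictured as kernel-smoothed histograms over cent bins, optionally folded into a single octave. The sketch below is a rough, self-contained stand-in under that assumption; the function names and the exact smoothing of the real ModeFunctions implementation may differ.

import numpy as np

def toy_pitch_distribution(cent_track, step_size=7.5, kernel_width=7.5):
	# Histogram the cent values into bins of step_size cents
	cent_track = np.asarray(cent_track, dtype=float)
	cent_track = cent_track[np.isfinite(cent_track)]
	edges = np.arange(np.floor(cent_track.min()) - step_size,
	                  np.ceil(cent_track.max()) + 2 * step_size, step_size)
	vals, edges = np.histogram(cent_track, bins=edges, density=True)
	bins = (edges[:-1] + edges[1:]) / 2.0

	# Smooth with a Gaussian kernel whose std. dev. is kernel_width cents
	sigma = kernel_width / step_size  # in bins
	x = np.arange(-int(4 * sigma) - 1, int(4 * sigma) + 2)
	kernel = np.exp(-0.5 * (x / sigma) ** 2)
	vals = np.convolve(vals, kernel / kernel.sum(), mode='same')
	return bins, vals

def toy_pcd(bins, vals, step_size=7.5):
	# Fold the distribution into one octave (0-1200 cents)
	pcd_bins = np.arange(0, 1200, step_size)
	pcd_vals = np.zeros(len(pcd_bins))
	for b, v in zip(bins, vals):
		pcd_vals[int(round((b % 1200) / step_size)) % len(pcd_bins)] += v
	return pcd_bins, pcd_vals

# e.g. a hypothetical cent track concentrated around the tonic and its fifth
cents = np.concatenate([np.random.normal(0, 20, 500), np.random.normal(700, 20, 300)])
bins, vals = toy_pitch_distribution(cents)
pcd_bins, pcd_vals = toy_pcd(bins, vals)
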
Example #2
	def train(self, mode_name, pitch_files, tonic_freqs, metric='pcd', save_dir=''):
		"""-------------------------------------------------------------------------
		To train a mode, a set of recordings with annotated tonics is required for
		each mode under consideration. This function only expects the recordings'
		pitch tracks and corresponding tonics as lists. The two lists should be
		indexed in parallel, so the tonic of the i-th pitch track in the pitch track
		list should be the i-th element of the tonic list. Once training is
		completed for a mode, the model is generated as a PitchDistribution object
		and saved in a JSON file. For loading these objects and other relevant
		information about the data structure, see the PitchDistribution class.
		----------------------------------------------------------------------------
		mode_name     : Name of the mode to be trained. This is only used for naming
						the resultant JSON file, in the form "mode_name.json"
		pitch_files   : List of files with pitch tracks extracted from the recording
						(i.e. single-column files with frequencies)
		tonic_freqs   : List of annotated tonic frequencies of recordings
		metric        : Whether the model should be octave wrapped (Pitch Class
						Distribution: PCD) or not (Pitch Distribution: PD)
		save_dir      : Where to save the resultant JSON files.
		-------------------------------------------------------------------------"""

		# To generate the model pitch distribution of a mode, pitch track of each
		# recording is iteratively converted to cents, according to their respective
		# annotated tonics. Then, these are appended to mode_track and a very long
		# pitch track is generated, as if it is a single very long recording. The 
		# pitch distribution of this track is the mode's model distribution.

		# Normalize the pitch tracks of the mode wrt the tonic frequency and concatenate
		mode_track = []
		for pf, tonic in zip(pitch_files, tonic_freqs):
			pitch_track = np.loadtxt(pf)
			if pitch_track.ndim > 1:  # assume the first col is time, the second is pitch and the rest is labels etc.
				pitch_track = pitch_track[:, 1]

			if self.chunk_size == 0:  # use the complete pitch track
				mode_track.append(mF.hz_to_cent(pitch_track, ref_freq=tonic))
			else:  # slice and use the start of the pitch track
				time_track = np.arange(0, self.frame_rate * len(pitch_track), self.frame_rate)
				pitch_track, segs = mF.slice(time_track, pitch_track, mode_name, self.chunk_size)
				mode_track.append(mF.hz_to_cent(pitch_track[0], ref_freq=tonic))
		mode_track = np.concatenate(mode_track)

		seglen = 'all' if self.chunk_size == 0 else (segs[0][1], segs[0][2])

		# generate the pitch distribution
		pitch_distrib = mF.generate_pd(mode_track, smooth_factor=self.smooth_factor,
		                               step_size=self.step_size, source=pitch_files, segment=seglen)
		if metric == 'pcd':  # convert to pitch class distribution, if specified
			pitch_distrib = mF.generate_pcd(pitch_distrib)

		# save the model to a file, if requested
		if save_dir:
			if not os.path.exists(save_dir):
				os.makedirs(save_dir)
			pitch_distrib.save(mode_name + '.json', save_dir=save_dir)

		return pitch_distrib
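
The tonic normalization above relies on mF.hz_to_cent. A minimal, self-contained stand-in (assuming the usual definition, cents = 1200 * log2(f / f_ref), with silent or invalid frames dropped) and the pooling of several recordings into one long cent track could look like this:

import numpy as np

def hz_to_cent_sketch(hz_track, ref_freq):
	hz_track = np.asarray(hz_track, dtype=float)
	hz_track = hz_track[hz_track > 20.0]  # drop silence / invalid frames (threshold is an assumption)
	return 1200.0 * np.log2(hz_track / ref_freq)

# e.g. pooling two hypothetical recordings with their annotated tonics
tracks = [np.array([220.0, 247.5, 261.6]), np.array([196.0, 220.0, 233.1])]
tonics = [220.0, 196.0]
mode_track = np.concatenate([hz_to_cent_sketch(t, f) for t, f in zip(tracks, tonics)])
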
Example #3
	def tonic_evaluate(self, mbid, estimated, annotated):
		est_cent = mf.hz_to_cent([estimated], annotated)[0]

		# octave wrapping
		cent_diff = est_cent % self.CENT_PER_OCTAVE

		# check if the tonic is found correct
		bool_tonic = min([cent_diff, self.CENT_PER_OCTAVE - cent_diff]) < self.tolerance

		# convert the cent difference to symbolic interval (P5, m3 etc.)
		for i in self.INTERVAL_SYMBOLS:
			if i[1] <= cent_diff < i[2]:
				interval = i[0]
				break
			elif cent_diff == 1200:
				interval = 'P1'
				break

		# if they are in the same octave, then the estimated and octave-wrapped values should be the same (very close)
		same_octave = (est_cent - cent_diff < 0.001)
		
		return {'mbid': mbid, 'tonic_eval': bool_tonic, 'same_octave': same_octave, 'cent_diff': cent_diff,
		        'interval': interval, 'annotated_tonic': annotated, 'estimated_tonic': estimated}
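
The evaluation above boils down to an octave-wrapped cent difference plus a tolerance check. A self-contained sketch of the same computation is given below; the 20-cent tolerance is an assumed default, while in the class it comes from self.tolerance and self.CENT_PER_OCTAVE.

import math

def is_tonic_correct(estimated_hz, annotated_hz, tolerance=20.0, cent_per_octave=1200):
	# cent difference of the estimate w.r.t. the annotation, wrapped into one octave
	cent_diff = (1200.0 * math.log(estimated_hz / annotated_hz, 2)) % cent_per_octave
	# distance to the nearest octave of the annotated tonic
	wrapped = min(cent_diff, cent_per_octave - cent_diff)
	return wrapped < tolerance, cent_diff

# e.g. an estimate exactly one octave above the annotation counts as correct
print(is_tonic_correct(440.0, 220.0))  # (True, 0.0)
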
Example #4
	def train(self, mode_name, pt_files, tonic_freqs, metric='pcd', save_dir=''):
		"""-------------------------------------------------------------------------
		To train a mode, a set of recordings with annotated tonics is required for
		each mode under consideration. This function only expects the recordings'
		pitch tracks and corresponding tonics as lists. The two lists should be
		indexed in parallel, so the tonic of the i-th pitch track in the pitch track
		list should be the i-th element of the tonic list.

		Each pitch track is sliced into chunks of size chunk_size and their pitch
		distributions are generated. Then, each of these chunk distributions is
		appended to a list. This list represents the mode with as many sample points
		as there are chunks. So, the result is a list of PitchDistribution objects,
		i.e. a list of structured dictionaries, and this is what is saved.
		----------------------------------------------------------------------------
		mode_name     : Name of the mode to be trained. This is only used for naming
		                the resultant JSON file, in the form "mode_name.json"
		pt_files      : List of files with pitch tracks extracted from the recordings
		                (i.e. single-column files with frequencies)
		tonic_freqs   : List of annotated tonic frequencies of the recordings
		metric        : Whether the model should be octave wrapped (Pitch Class
		                Distribution: PCD) or not (Pitch Distribution: PD)
		save_dir      : Where to save the resultant JSON files.
		-------------------------------------------------------------------------"""
		save_name = mode_name + '.json'
		pitch_distrib_list = []

		# Each pitch track is iterated over and its pitch distribution is generated
		# individually, then appended to pitch_distrib_list. Notice that although we treat
		# each chunk individually, we use a single tonic annotation for each recording
		# so we assume that the tonic doesn't change throughout a recording.

		for pf, tonic in zip(pt_files, tonic_freqs):
			pitch_track = np.loadtxt(pf)
			if pitch_track.ndim > 1:  # assume the first col is time, the second is pitch and the rest is labels etc
				pitch_track = pitch_track[:,1]
			time_track = np.arange(0, (self.frame_rate*len(pitch_track)), self.frame_rate)

			# Current pitch track is sliced into chunks.
			if self.chunk_size == 0:  # no slicing
				pts = [pitch_track]
				# keep the (source, start, end) format expected by train_chunks()
				chunk_data = [(pf, 0, time_track[-1])]
			else:
				pts, chunk_data = mf.slice(time_track, pitch_track, pf, self.chunk_size,
				                           self.threshold, self.overlap)

			# Each chunk is converted to cents
			pts = [mf.hz_to_cent(k, ref_freq=tonic) for k in pts]

			# train_chunks() iteratively generates the distribution of each chunk
			# and returns them as a list. After this point, we only need to save
			# the result. God bless modular programming!
			temp_list = self.train_chunks(pts, chunk_data, tonic, metric)

			# temp_list is a list of PitchDistribution objects, one per chunk of the
			# current recording. Since these objects remember their source and
			# segment, we simply flatten them into a single list. From now on, each
			# chunk is treated as an individual distribution, regardless of which
			# recording it belongs to.
			pitch_distrib_list.extend(temp_list)

		# save the model to a file, if requested
		if save_dir:
			if not os.path.exists(save_dir):
				os.makedirs(save_dir)

			# Dump the list of dictionaries in a JSON file.
			dist_json = [{'bins':d.bins.tolist(), 'vals':d.vals.tolist(),
			              'kernel_width':d.kernel_width, 'source':d.source,
			              'ref_freq':d.ref_freq, 'segmentation':d.segmentation,
			              'overlap':d.overlap} for d in pitch_distrib_list]

			with open(os.path.join(save_dir, save_name), 'w') as f:
				json.dump(dist_json, f, indent=2)

		return pitch_distrib_list
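
mf.slice is assumed here to cut a pitch track into fixed-duration chunks and to report (source, start, end) for each of them. A simplified, self-contained stand-in for that behaviour (ignoring the threshold and overlap options of the real function) might look as follows:

import numpy as np

def slice_sketch(time_track, pitch_track, source, chunk_size):
	chunks, chunk_info = [], []
	start = 0.0
	while start < time_track[-1]:
		end = start + chunk_size
		idx = np.where((time_track >= start) & (time_track < end))[0]
		if len(idx):
			chunks.append(pitch_track[idx])
			chunk_info.append((source, start, min(end, time_track[-1])))
		start = end
	return chunks, chunk_info

# e.g. a hypothetical 10-second track at a 0.01 s hop, cut into 3-second chunks
frame_rate = 0.01
pitch = np.random.uniform(100, 400, 1000)
time_track = np.arange(len(pitch)) * frame_rate
pts, chunk_data = slice_sketch(time_track, pitch, 'dummy.pitch', 3.0)
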
Example #5
	def chunk_estimate(self, pitch_track, mode_names=[], mode_name='', mode_dir='./',
		                 est_tonic=True, est_mode=True, distance_method="euclidean",
		                 metric='pcd', ref_freq=440, min_cnt=3, equalSamplePerMode = False):
		"""-------------------------------------------------------------------------
		This function is called by the wrapper estimate() function only. It gets a 
		pitch track chunk, generates its pitch distribution and compares it with the
		chunk distributions of the candidate modes. Then, finds the min_cnt nearest
		neighbors and returns them to estimate(), where these are used to make an
		estimation about the overall recording.
		----------------------------------------------------------------------------
		pitch_track     : Pitch track chunk of the input recording whose tonic and/or
		                  mode is to be estimated. This is only a 1-D list of frequency
		                  values.
		mode_dir        : The directory where the mode models are stored. This is to
		                  load the annotated mode or the candidate mode.
		mode_names      : Names of the candidate modes. These are used when loading
		                  the mode models. If the mode isn't estimated, this parameter
		                  isn't used and can be ignored.
		mode_name       : Annotated mode of the recording. If it's not known and to be
		                  estimated, this parameter isn't used and can be ignored.
		est_tonic       : Whether tonic is to be estimated or not. If this flag is
		                  False, ref_freq is treated as the annotated tonic.
		est_mode        : Whether mode is to be estimated or not. If this flag is
		                  False, mode_name is treated as the annotated mode.
		distance_method : The choice of distance methods. See distance() in
		                  ModeFunctions for more information.
		metric          : Whether the model should be octave wrapped (Pitch Class
		                  Distribution: PCD) or not (Pitch Distribution: PD)
		ref_freq        : Annotated tonic of the recording. If it's unknown, we use
		                  an arbitrary value, so this can be ignored.
		min_cnt         : The number of nearest neighbors of the current chunk to be
		                  returned. The details of this parameter and its implications
		                  are explained in the first lines of estimate().
		-------------------------------------------------------------------------"""
		# Preliminaries before the estimations:
		# the Hz-to-cent conversion is done and the pitch distribution is generated
		cent_track = mf.hz_to_cent(pitch_track, ref_freq)
		dist = mf.generate_pd(cent_track, ref_freq=ref_freq,
			                  smooth_factor=self.smooth_factor, step_size=self.step_size)
		dist = mf.generate_pcd(dist) if (metric=='pcd') else dist
		# The model mode distribution(s) are loaded. If the mode is annotated and tonic
		# is to be estimated, only the model of annotated mode is retrieved.
		mode_collections = [self.load_collection(mode, dist_dir=mode_dir) for mode in mode_names]

		if equalSamplePerMode:
			minSamp = min([len(n) for n in mode_collections])
			for i, m in enumerate(mode_collections):
				mode_collections[i] = random.sample(m, minSamp)

		# cum_lens (cumulative lengths) keeps track of the number of chunks retrieved
		# from each mode, so that we are able to find out which mode the best
		# performing chunk belongs to.
		cum_lens = np.cumsum([len(col) for col in mode_collections])

		# load mode distribution
		mode_dists = [d for col in mode_collections for d in col]
		mode_dist = self.load_collection(mode_name, dist_dir=mode_dir) if (mode_name!='') else None

		#Initializations of possible output parameters
		tonic_list = [0 for x in range(min_cnt)]
		mode_list = ['' for x in range(min_cnt)]
		min_distance_list = np.zeros(min_cnt)

		# If the tonic will be estimated, there are certain common preliminary steps,
		# regardless of whether the process is a joint estimation or a tonic estimation.
		if(est_tonic):
			if(metric=='pcd'):
				# This is a precaution step, just to be on the safe side. If there
				# happens to be a peak at the last (and first due to the circular nature
				# of PCD) sample, it is considered as two peaks, one at the end and
				# one at the beginning. To prevent this, we find the global minima
				# of the distribution and shift it to the beginning, i.e. make it the
				# new reference frequency. This new reference could have been any other
				# as long as there is no peak there, but minima is fairly easy to find.
				shift_factor = dist.vals.tolist().index(min(dist.vals))
				dist = dist.shift(shift_factor)
				# anti-freq is the new reference frequency after shift, as mentioned
				# above.
				anti_freq = mf.cent_to_hz([dist.bins[shift_factor]], ref_freq=ref_freq)[0]
				# Peaks of the distribution are found and recorded. These will be treated
				# as tonic candidates.
				peak_idxs, peak_vals = dist.detect_peaks()
			elif(metric=='pd'):
				# Since PD isn't circular, the precaution in PCD is unnecessary here.
				# Peaks of the distribution are found and recorded. These will be treated
				# as tonic candidates.
				peak_idxs, peak_vals = dist.detect_peaks()
				# The number of samples to be shifted is the list [peak indices - zero bin].
				# The origin is the bin with value zero and the shifting is done w.r.t. it.
				origin =  np.where(dist.bins==0)[0][0]
				shift_idxs = [(idx - origin) for idx in peak_idxs]

		# Here the actual estimation steps begin

		#Joint Estimation
		### TODO: The first steps of joint estimation are very similar for both Bozkurt and
		### Chordia. We might squeeze them into a single function in ModeFunctions.
		if(est_tonic and est_mode):
			if(metric=='pcd'):
				# PCD doesn't require any preliminary steps. Generate the distance matrix.
				# The rows are tonic candidates and columns are mode candidates.
				dist_mat = mf.generate_distance_matrix(dist, peak_idxs, mode_dists, method=distance_method)
			elif(metric=='pd'):
				# Since PD lengths aren't equal, zero padding is required and
				# tonic_estimate() of ModeFunctions just does that. It can handle only
				# a single column, so the columns of the matrix are iteratively generated
				dist_mat = np.zeros((len(shift_idxs), len(mode_dists)))
				for m in xrange(len(mode_dists)):
					dist_mat[:,m] = mf.tonic_estimate(dist, shift_idxs, mode_dists[m],
						                              distance_method=distance_method,
						                              metric=metric, step_size=self.step_size)

			# The distance matrix is ready now. Since we need to report min_cnt
			# nearest neighbors, the loop is iterated min_cnt times and returns
			# one neighbor at each iteration, from closest to furthest. When the
			# nearest neighbor is found, its entry is replaced with a value worse
			# than the maximum, so in the next iteration the nearest would be the
			# second nearest, and so on.
			for r in xrange(min_cnt):
				# The minimum of the distance matrix is found. This is to find
				# the current nearest neighbor chunk.
				min_row = np.where((dist_mat == np.amin(dist_mat)))[0][0]
				min_col = np.where((dist_mat == np.amin(dist_mat)))[1][0]
				# Due to the precaution step of PCD, the reference frequency is
				# changed. That's why it's treated differently than PD. Here,
				# the cent value of the tonic estimate is converted back to Hz.	
				if(metric=='pcd'):
					tonic_list[r] = mf.cent_to_hz([dist.bins[peak_idxs[min_row]]],
						                          anti_freq)[0]
				elif(metric=='pd'):
					tonic_list[r] = mf.cent_to_hz([shift_idxs[min_row] * self.step_size],
						                          ref_freq)[0]
				# We have found out which chunk is our nearest now. Here, we find out
				# which mode it belongs to, from cum_lens.
				mode_list[r] = (mode_names[min(np.where((cum_lens > min_col))[0])],
					           mode_dists[min_col].source[:-6])
				# To observe how close these neighbors are, we report their distances.
				# This doesn't affect the computation at all; it's just for evaluating
				# and understanding the behaviour of the system.
				min_distance_list[r] = dist_mat[min_row][min_col]
				# The minimum value is replaced with a value larger than maximum,
				# so we can easily find the second nearest neighbor.
				dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
			return [[mode_list, tonic_list], min_distance_list.tolist()]

		# Tonic Estimation
		elif(est_tonic):
			# This part assigns the special case changes to standard variables,
			# so that we can treat PD and PCD in the same way, as much as
			# possible.
			peak_idxs = shift_idxs if metric=='pd' else peak_idxs
			anti_freq = ref_freq if metric=='pd' else anti_freq

			# The distance matrix is generated. In the tonic_estimate() function
			# of ModeFunctions, PD and PCD are treated differently and it
			# handles the special cases such as zero-padding. The mode is
			# already known, so there is only one mode collection, i.e. the
			# set of chunk distributions that belong to the same mode, to
			# be compared. Each row is a chunk distribution and each
			# column is a tonic candidate.
			dist_mat = [mf.tonic_estimate(dist, peak_idxs, d,
				                          distance_method=distance_method,
				                          metric=metric, step_size=self.step_size) for d in mode_dist]

			# The distance matrix is ready now. Since we need to report min_cnt
			# nearest neighbors, the loop is iterated min_cnt times and returns
			# one neighbor at each iteration, from closest to furthest. When the
			# nearest neighbor is found, its entry is replaced with a value worse
			# than the maximum, so in the next iteration the nearest would be the
			# second nearest, and so on.
			for r in xrange(min_cnt):
				# The minimum of the distance matrix is found. This is to find
				# the current nearest neighbor chunk.
				min_row = np.where((dist_mat == np.amin(dist_mat)))[0][0]
				min_col = np.where((dist_mat == np.amin(dist_mat)))[1][0]
				# The corresponding tonic candidate is found, based on the
				# current nearest neighbor, and its distance is recorded.
				tonic_list[r] = (mf.cent_to_hz([dist.bins[peak_idxs[min_col]]],
					                           anti_freq)[0], mode_dist[min_row].source[:-6])
				min_distance_list[r] = dist_mat[min_row][min_col]
				# The minimum value is replaced with a value larger than maximum,
				# so we can easily find the second nearest neighbor.
				dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
			return [tonic_list, min_distance_list.tolist()]

		# Mode estimation
		elif(est_mode):
			# Only in mode estimation, the distance matrix is actually a vector.
			# Since the tonic is annotated, the distribution isn't shifted; it is
			# simply compared to each chunk distribution of each candidate mode.
			# Again, mode_estimate() of ModeFunctions handles the different
			# approach required for PCD and PD.
			distance_vector = mf.mode_estimate(dist, mode_dists,
				                               distance_method=distance_method,
				                               metric=metric, step_size=self.step_size)
			
			# The distance vector is ready now. Since we need to report min_cnt
			# nearest neighbors, the loop is iterated min_cnt times and returns
			# one neighbor at each iteration, from closest to furthest. When the
			# nearest neighbor is found, its entry is replaced with a value worse
			# than the maximum, so in the next iteration the nearest would be the
			# second nearest, and so on.
			for r in xrange(min_cnt):
				# The minimum of the distance vector is found. This is to find
				# the current nearest neighbor chunk.
				idx = np.argmin(distance_vector)
				# We have found out which chunk is our nearest now. Here, we find out
				# which mode it belongs to, from cum_lens.
				mode_list[r] = (mode_names[min(np.where((cum_lens > idx))[0])],
					                                    mode_dists[idx].source[:-6])
				# The distance of the current nearest neighbor is recorded. The
				# details of this step are explained at the end of the analogous
				# loop in the joint estimation part of this function.
				min_distance_list[r] = distance_vector[idx]
				# The minimum value is replaced with a value larger than maximum,
				# so we can easily find the second nearest neighbor.
				distance_vector[idx] = (np.amax(distance_vector) + 1) 
			return [mode_list, min_distance_list.tolist()]

		else:
			return 0
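
All three estimation branches above collect nearest neighbors with the same pattern: find the minimum of the distance matrix, record its position, then overwrite it with a value above the maximum so that the next iteration yields the next-nearest entry. A compact, self-contained version of that pattern:

import numpy as np

def n_smallest_cells(dist_mat, n):
	dist_mat = np.array(dist_mat, dtype=float)  # work on a copy
	cells = []
	for _ in range(min(n, dist_mat.size)):
		row, col = np.unravel_index(np.argmin(dist_mat), dist_mat.shape)
		cells.append((int(row), int(col), float(dist_mat[row, col])))
		dist_mat[row, col] = dist_mat.max() + 1  # mask the current minimum
	return cells

# e.g. the three smallest (tonic candidate, chunk) cells of a toy matrix, nearest first
print(n_smallest_cells([[0.4, 0.1], [0.2, 0.9]], 3))  # [(0, 1, 0.1), (1, 0, 0.2), (0, 0, 0.4)]
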
Example #6
	def estimate(self, pitch_file, mode_names=[], mode_name='', mode_dir='./', est_mode=True,
		         distance_method="euclidean", metric='pcd', tonic_freq=None,
		         k_param=1, equalSamplePerMode = False):
		"""-------------------------------------------------------------------------
		In the estimation phase, the input pitch track is sliced into chunk and each
		chunk is compared with each candidate mode's each sample model, i.e. with 
		the distributions of each training recording's each chunk. This function is
		a wrapper, that handles decision making portion and the overall flow of the
		estimation process. Internally, segment estimate is called for generation of
		distance matrices and detecting neighbor distributions.

		1) Joint Estimation: Neither the tonic nor the mode of the recording is known.
		Then, joint estimation estimates both of these parameters without any prior
		knowledge about the recording.
		To use this: est_mode should be True and tonic_freq should be left as None,
		since both are to be estimated. In this case the mode_name parameter is not
		used, since it is meant to pass the annotated mode of the recording.

		2) Tonic Estimation: The mode of the recording is known and the tonic is to
		be estimated. This is generally the most accurate estimation among the three.
		To use this: est_mode should be False and tonic_freq should be left as None.
		The annotated mode is passed via mode_name; the mode_names parameter is not
		used since there are no candidate modes.

		3) Mode Estimation: The tonic of the recording is known and the mode is to
		be estimated.
		To use this: est_mode should be True and the annotated tonic should be passed
		via tonic_freq. In this case the mode_name parameter isn't used since the
		mode annotation is not available. It can be ignored.
		----------------------------------------------------------------------------
		pitch_file      : File containing the pitch track of the input recording
						whose tonic and/or mode is to be estimated.
		mode_dir        : The directory where the mode models are stored. This is to
						load the annotated mode or the candidate modes.
		mode_names      : Names of the candidate modes. These are used when loading
						the mode models. If the mode isn't estimated, this parameter
						isn't used and can be ignored.
		mode_name       : Annotated mode of the recording. If it's not known and is
						to be estimated, this parameter isn't used and can be ignored.
		est_mode        : Whether the mode is to be estimated or not. If this flag is
						False, mode_name is treated as the annotated mode.
		k_param         : The k parameter of K Nearest Neighbors.
		distance_method : The choice of distance method. See distance() in
						ModeFunctions for more information.
		metric          : Whether the model should be octave wrapped (Pitch Class
						Distribution: PCD) or not (Pitch Distribution: PD)
		tonic_freq      : Annotated tonic of the recording. If it's unknown, leave it
						as None and the tonic will be estimated.
		-------------------------------------------------------------------------"""
		# load pitch track
		pitch_track = np.loadtxt(pitch_file)

		# assume the first col is time, the second is pitch and the rest is labels etc.
		pitch_track = pitch_track[:,1] if pitch_track.ndim > 1 else pitch_track

		# Pitch track is sliced into chunks.
		time_track = np.arange(0, (self.frame_rate*len(pitch_track)), self.frame_rate)

		if self.chunk_size == 0: # no slicing
			pts = [pitch_track]
			chunk_data = ['input_all']
		else:
			pts, chunk_data = mf.slice(time_track, pitch_track, 'input', self.chunk_size,
				                 self.threshold, self.overlap)


		# Here's a neat trick. In order to return an estimation about the entire
		# recording based on our observations of the individual chunks, we look at
		# the nearest neighbors of the union of all chunks. chunk_estimate() returns
		# the min_cnt closest neighbors of each chunk. To make sure that we capture
		# all of the overall nearest neighbors, we request a little more than
		# required, treat the union of these neighbors as if it were the distance
		# matrix of the entire recording, and then find the k nearest neighbors
		# from that union. This is quite an overshoot; we only need
		# min_cnt >= k_param.

		### TODO: shrink this value as much as possible.
		min_cnt = len(pts) * k_param

		#Initializations
		tonic_list = 0
		mode_list = ''

		# parse tonic input
		if tonic_freq:  # tonic is already known;
			est_tonic = False
		else:
			est_tonic = True
			# take A4 as the dummy frequency value for cent conversion
			tonic_freq = 440

		if not (est_tonic or est_mode):
			print "Both tonic and mode are known!"
			return -1

		if(est_tonic and est_mode):
			neighbors = [ [mode_list, tonic_list] for i in range(len(chunk_data)) ]
		elif(est_tonic):
			neighbors = [ tonic_list for i in range(len(chunk_data)) ]
		elif(est_mode):
			neighbors = [ mode_list for i in range(len(chunk_data)) ]

		# chunk_estimate() generates the distribution of each chunk iteratively,
		# compares it with all candidates and appends the min_cnt closest neighbors
		# of each chunk to the neighbors list.
		for p in range(len(pts)):
			neighbors[p] = self.chunk_estimate(pts[p], mode_names=mode_names,
			                                   mode_name=mode_name, mode_dir=mode_dir,
				                               est_tonic=est_tonic, est_mode=est_mode,
				                               distance_method=distance_method,
				                               metric=metric, ref_freq=tonic_freq,
				                               min_cnt=min_cnt,
				                               equalSamplePerMode = equalSamplePerMode)
		
		### TODO: Clean up the spaghetti decision making part. The procedures
		### are quite repetitive. Wrap them up with a separate function.

		# Temporary variables used during the decision making part.
		candidate_distances, candidate_ests, candidate_sources, kn_distances, kn_ests, \
		kn_sources, idx_counts, elem_counts, res_distances, res_sources = ([] for i in range(10))

		# Joint estimation decision making. 
		if(est_mode and est_tonic):
			# Flattens the returned candidates and the related data about them and
			# stores them in the candidate_* variables. candidate_distances stores
			# the distance values, candidate_ests stores the mode/tonic pairs and
			# candidate_sources stores the sources of the nearest neighbors.
			for i in xrange(len(pts)):
				for j in neighbors[i][1]:
					candidate_distances.append(j)
				for l in xrange(len(neighbors[i][0][1])):
					candidate_ests.append((neighbors[i][0][1][l], neighbors[i][0][0][l][0]))
					candidate_sources.append(neighbors[i][0][0][l][1])

			# Finds the nearest neighbors and fills all the related data about
			# them into the kn_* variables. Each of these variables has length
			# k_param. kn_distances stores the distance values, kn_ests stores the
			# mode/tonic pairs and kn_sources stores the name/id of the distribution
			# that gave rise to the corresponding distance.
			for k in xrange(k_param):
				idx = np.argmin(candidate_distances)
				kn_distances.append(candidate_distances[idx])
				kn_ests.append(candidate_ests[idx])
				kn_sources.append(candidate_sources[idx])
				candidate_distances[idx] = (np.amax(candidate_distances) + 1)
			
			# Counts the occurrences of each candidate mode/tonic pair in
			# the k nearest neighbors. The result is our estimation.
			for c in set(kn_ests):
				idx_counts.append(kn_ests.count(c))
				elem_counts.append(c)
			joint_estimation = elem_counts[np.argmax(idx_counts)]

			# We have concluded our estimation. Here, we retrieve the data
			# relevant to this estimation: the sources and the corresponding
			# distances.
			for m in xrange(len(kn_ests)):
				if (kn_ests[m] == joint_estimation):
					res_sources.append(kn_sources[m])
					res_distances.append(kn_distances[m])
			result = [joint_estimation, res_sources, res_distances]

		# Mode estimation decision making
		elif(est_mode):
			# Flattens the returned candidates and the related data about them and
			# stores them in the candidate_* variables. candidate_distances stores
			# the distance values, candidate_ests stores the candidate modes and
			# candidate_sources stores the sources of the nearest neighbors.
			for i in xrange(len(pts)):
				for j in neighbors[i][1]:
					candidate_distances.append(j)
				for l in xrange(len(neighbors[i][0])):
					candidate_ests.append(neighbors[i][0][l][0])
					candidate_sources.append(neighbors[i][0][l][1])

			# Finds the nearest neighbors and fills all the related data about
			# them into the kn_* variables. Each of these variables has length
			# k_param. kn_distances stores the distance values, kn_ests stores the
			# mode names and kn_sources stores the name/id of the distributions
			# that gave rise to the corresponding distances.
			for k in xrange(k_param):
				idx = np.argmin(candidate_distances)
				kn_distances.append(candidate_distances[idx])
				kn_ests.append(candidate_ests[idx])
				kn_sources.append(candidate_sources[idx])
				candidate_distances[idx] = (np.amax(candidate_distances) + 1)

			# Counts the occurrences of each candidate mode name in
			# the k nearest neighbors. The result is our estimation.
			for c in set(kn_ests):
				idx_counts.append(kn_ests.count(c))
				elem_counts.append(c)
			mode_estimation = elem_counts[np.argmax(idx_counts)]

			# We have concluded our estimation. Here, we retrieve the data
			# relevant to this estimation: the sources and the corresponding
			# distances.
			for m in xrange(len(kn_ests)):
				if (kn_ests[m] == mode_estimation):
					res_sources.append(kn_sources[m])
					res_distances.append(kn_distances[m])
			result = [mode_estimation, res_sources, res_distances]


		# Tonic estimation decision making
		elif(est_tonic):
			# Flattens the returned candidates and the related data about them and
			# stores them in the candidate_* variables. candidate_distances stores
			# the distance values, candidate_ests stores the candidate peak
			# frequencies and candidate_sources stores the sources of the nearest
			# neighbors.
			for i in xrange(len(pts)):
				for j in neighbors[i][1]:
					candidate_distances.append(j)
				for l in xrange(len(neighbors[i][0])):
					candidate_ests.append(neighbors[i][0][l][0])
					candidate_sources.append(neighbors[i][0][l][1])

			# Finds the nearest neighbors and fills all the related data about
			# them into the kn_* variables. Each of these variables has length
			# k_param. kn_distances stores the distance values, kn_ests stores the
			# peak frequencies and kn_sources stores the name/id of the
			# distributions that gave rise to the corresponding distances.
			for k in xrange(k_param):
				idx = np.argmin(candidate_distances)
				kn_distances.append(candidate_distances[idx])
				kn_ests.append(candidate_ests[idx])
				kn_sources.append(candidate_sources[idx])
				candidate_distances[idx] = (np.amax(candidate_distances) + 1)

			# Counts the occurrences of each candidate tonic frequency in
			# the k nearest neighbors. The result is our estimation.
			for c in set(kn_ests):
				idx_counts.append(kn_ests.count(c))
				elem_counts.append(c)
			tonic_estimation = elem_counts[np.argmax(idx_counts)]

			# We have concluded our estimation. Here, we retrieve the data
			# relevant to this estimation: the sources and the corresponding
			# distances.
			for m in xrange(len(kn_ests)):
				if (kn_ests[m] == tonic_estimation):
					res_sources.append(kn_sources[m])
					res_distances.append(kn_distances[m])
			result = [tonic_estimation, res_sources, res_distances]

		return result
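
The three decision-making branches above implement the same k-nearest-neighbor vote by hand: keep the k candidates with the smallest distances, then pick the most frequent estimate among them and report the supporting sources and distances. A self-contained sketch of that idea, with hypothetical labels and distances (ties are broken by Counter here rather than by np.argmax as above):

from collections import Counter
import numpy as np

def knn_vote(candidate_ests, candidate_distances, k):
	order = np.argsort(candidate_distances)[:k]  # indices of the k nearest candidates
	kn_ests = [candidate_ests[i] for i in order]
	estimation = Counter(kn_ests).most_common(1)[0][0]
	# distances of the neighbors that agree with the winning estimate
	res_distances = [candidate_distances[i] for i in order if candidate_ests[i] == estimation]
	return estimation, res_distances

# e.g. three chunk neighbors voting on a mode label
print(knn_vote(['hicaz', 'rast', 'hicaz'], [0.12, 0.08, 0.15], k=3))  # ('hicaz', [0.12, 0.15])
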
Example #7
	def estimate(self, pitch_file, mode_in='./', tonic_freq=None, rank=1,
	             distance_method="bhat", metric='pcd'):
		"""-------------------------------------------------------------------------
		This is the main estimation function. There are three different types of
		estimation.

		1) Joint Estimation: Neither the tonic nor the mode of the recording is known.
		Then, joint estimation estimates both of these parameters without any prior
		knowledge about the recording.
		To use this: pass the set of candidate mode models (e.g. a list of JSON files
		or a dictionary of distributions per mode) as mode_in and leave tonic_freq
		as None, since both are to be estimated.

		2) Tonic Estimation: The mode of the recording is known and the tonic is to
		be estimated. This is generally the most accurate estimation among the three.
		To use this: pass the model of the known mode (a single JSON file or a
		PitchDistribution object) as mode_in and leave tonic_freq as None.

		3) Mode Estimation: The tonic of the recording is known and the mode is to
		be estimated.
		To use this: pass the set of candidate mode models as mode_in and the
		annotated tonic as tonic_freq.
		----------------------------------------------------------------------------
		pitch_file      : File containing the pitch track of the input recording
						whose tonic and/or mode is to be estimated.
		mode_in         : The mode input. If it is a filename or a distribution
						object, the mode is treated as known and only the tonic will
						be estimated. If a list of JSON files or a dictionary of
						distributions (one per mode) is given, the mode will be
						estimated. In the case of files, the mode names are taken
						from the JSON filenames.
		tonic_freq      : Annotated tonic of the recording. If it's unknown, leave it
						as None and the tonic will be estimated.
		rank            : The number of estimations expected from the system. If
						this is 1, estimation returns the most likely tonic, mode
						or tonic/mode pair. If it is n, it returns a sorted list
						of tuples of length n, each containing a tonic/mode pair. 
		distance_method : The choice of distance methods. See distance() in
						ModeFunctions for more information.
		metric          : Whether the model should be octave wrapped (Pitch Class
						Distribution: PCD) or not (Pitch Distribution: PD)
		-------------------------------------------------------------------------"""

		# load pitch track 
		pitch_track = np.loadtxt(pitch_file)

		# assume the first col is time, the second is pitch and the rest is labels etc.
		pitch_track = pitch_track[:,1] if pitch_track.ndim > 1 else pitch_track

		# parse mode input
		try:
			# list of json files per mode
			if all(os.path.isfile(m) for m in mode_in): 
				est_mode = True  # do mode estimation
				mode_names = [os.path.splitext(m)[0] for m in mode_in]
				models = [pD.load(m) for m in mode_in]
			elif os.path.isfile(mode_in): # json file
				est_mode = False # mode already known
				model = pD.load(mode_in)
		except TypeError:
			try:  # models
				if isinstance(mode_in, pD.PitchDistribution):
					# mode is loaded
					est_mode = False  # mode already known
					model = mode_in
				elif all(isinstance(m, pD.PitchDistribution) for m in mode_in.values()):
					# models of all modes are loaded
					est_mode = True  # do mode estimation
					mode_names = mode_in.keys()
					models = [mode_in[m] for m in mode_names]
			except:
				raise ValueError("Unknown mode input!")

		# parse tonic input
		if tonic_freq:  # tonic is already known;
			est_tonic = False
		else:
			est_tonic = True
			tonic_freq = 440  # take A4 as the dummy frequency value for cent conversion; it doesn't affect anything

		if not (est_tonic or est_mode):
			raise ValueError("Both tonic and mode are known!")

		# slice the pitch track if specified and use the start of the pitch track
		if self.chunk_size > 0:
			time_track = np.arange(0, self.frame_rate * len(pitch_track), self.frame_rate)
			pitch_track, segs = mF.slice(time_track, pitch_track, '', self.chunk_size)
			pitch_track = pitch_track[0]

		# normalize pitch track according to the given tonic frequency
		cent_track = mF.hz_to_cent(pitch_track, ref_freq=tonic_freq)

		# Pitch distribution of the input recording is generated
		distrib = mF.generate_pd(cent_track, ref_freq=tonic_freq, smooth_factor=self.smooth_factor,
		                         step_size=self.step_size)

		# convert to PCD, if specified
		distrib = mF.generate_pcd(distrib) if (metric == 'pcd') else distrib

		# Saved mode models are loaded and output variables are initiated
		tonic_ranked = [('', 0) for x in range(rank)]
		mode_ranked = [('', 0) for x in range(rank)]

		# Preliminary steps for tonic identification
		if est_tonic:
			if metric == 'pcd':
				# If there happens to be a peak at the last (and first due to the circular
				# nature of PCD) sample, it is considered as two peaks, one at the end and
				# one at the beginning. To prevent this, we find the global minima (as it
				# is easy to compute) of the distribution and make it the new reference
				# frequency, i.e. shift it to the beginning.
				shift_factor = distrib.vals.tolist().index(min(distrib.vals))
				distrib = distrib.shift(shift_factor)

				# update to the new reference frequency after shift
				tonic_freq = mF.cent_to_hz([distrib.bins[shift_factor]], ref_freq=tonic_freq)[0]

				# Find the peaks of the distribution. These are the tonic candidates.
				peak_idxs, peak_vals = distrib.detect_peaks()
			elif metric == 'pd':
				# Find the peaks of the distribution. These are the tonic candidates
				peak_idxs, peak_vals = distrib.detect_peaks()

				# The number of samples to be shifted is the list [peak indices - zero bin].
				# The origin is the bin with value zero and the shifting is done w.r.t. it.
				origin = np.where(distrib.bins == 0)[0][0]
				shift_idxs = [(idx - origin) for idx in peak_idxs]

		# Joint Estimation
		if (est_tonic and est_mode):
			if (metric == 'pd'):
				# Since PD lengths aren't equal, we zero-pad the distributions for
				# comparison; tonic_estimate() of ModeFunctions does just that. It can
				# handle only a single column, so the columns of the matrix are
				# generated iteratively.
				dist_mat = np.zeros((len(shift_idxs), len(models)))
				for m, model in enumerate(models):
					dist_mat[:, m] = mF.tonic_estimate(distrib, shift_idxs, model, distance_method=distance_method,
					                                   metric=metric, step_size=self.step_size)
			elif (metric == 'pcd'):
				# PCD doesn't require any preliminary steps. Generate the distance matrix.
				# The rows are tonic candidates and columns are mode candidates.
				dist_mat = mF.generate_distance_matrix(distrib, peak_idxs, models, method=distance_method)

			# The distance matrix is ready now. The loop is iterated once per rank
			# (i.e. once per tonic/mode estimate pair). When the best estimate is
			# found, its entry is replaced with a value worse than the maximum, so
			# in the next iteration the estimate would be the second best, and so on.
			for r in range(min(rank, len(peak_idxs))):
				# The minimum of the distance matrix is found. This is where the
				# distribution is most similar to a mode distribution, according
				# to the corresponding tonic candidate. The corresponding tonic
				# and mode pair is our current estimate.
				min_row = np.where((dist_mat == np.amin(dist_mat)))[0][0]
				min_col = np.where((dist_mat == np.amin(dist_mat)))[1][0]
				# Due to the precaution step of PCD, the reference frequency is
				# changed. That's why it's treated differently than PD. Here,
				# the cent value of the tonic estimate is converted back to Hz.
				if (metric == 'pcd'):
					tonic_ranked[r] = (mF.cent_to_hz([distrib.bins[peak_idxs[min_row]]],
					                                 tonic_freq)[0], dist_mat[min_row][min_col])
				elif (metric == 'pd'):
					tonic_ranked[r] = (mF.cent_to_hz([shift_idxs[min_row] * self.step_size],
					                                 tonic_freq)[0], dist_mat[min_row][min_col])
				# Current mode estimate is recorded.
				mode_ranked[r] = (mode_names[min_col], dist_mat[min_row][min_col])
				# The minimum value is replaced with a value larger than maximum,
				# so we won't return this estimate pair twice.
				dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
			return mode_ranked, tonic_ranked

		# Tonic Estimation
		elif (est_tonic):
			# This part assigns the special case changes to standard variables,
			# so that we can treat PD and PCD in the same way, as much as
			# possible. 
			peak_idxs = shift_idxs if (metric == 'pd') else peak_idxs
			tonic_freq = tonic_freq if (metric == 'pcd') else tonic_freq

			# The distance vector is generated. In the tonic_estimate() function
			# of ModeFunctions, PD and PCD are treated differently and it
			# handles the special cases such as zero-padding. The mode is
			# already known, so there is only one model to be compared. Each
			# entry corresponds to one tonic candidate.
			distance_vector = mF.tonic_estimate(distrib, peak_idxs, model, distance_method=distance_method,
			                                    metric=metric, step_size=self.step_size)

			# The distance vector is ready now. The loop is iterated once per rank.
			# When the best estimate is found, its entry is replaced with a value
			# worse than the maximum, so in the next iteration the estimate would
			# be the second best, and so on.
			for r in range(min(rank, len(peak_idxs))):
				# Minima is found, corresponding tonic candidate is our current
				# tonic estimate
				idx = np.argmin(distance_vector)
				# Due to the changed reference frequency in PCD's precaution step,
				# PCD and PD are treated differently here. 
				# TODO: review here, this might be tedious due to 257th line.
				if (metric == 'pcd'):
					tonic_ranked[r] = (mF.cent_to_hz([distrib.bins[peak_idxs[idx]]],
					                                 tonic_freq)[0], distance_vector[idx])
				elif (metric == 'pd'):
					tonic_ranked[r] = (mF.cent_to_hz([shift_idxs[idx] * self.step_size],
					                                 tonic_freq)[0], distance_vector[idx])
				# Current minima is replaced with a value larger than maxima,
				# so that we won't return the same estimate twice.
				distance_vector[idx] = (np.amax(distance_vector) + 1)
			return tonic_ranked

		# Mode Estimation
		elif (est_mode):
			# Distance vector is generated. Again, mode_estimate() of
			# ModeFunctions handles the different approach required for
			# PCD and PD. Since tonic is known, the distributions aren't
			# shifted and are only compared to candidate mode models.
			distance_vector = mF.mode_estimate(distrib, models, distance_method=distance_method, metric=metric,
			                                   step_size=self.step_size)

			# The distance vector is ready now. The loop is iterated once per rank.
			# When the best estimate is found, its entry is replaced with a value
			# worse than the maximum, so in the next iteration the estimate would
			# be the second best, and so on.
			for r in range(min(rank, len(mode_names))):
				# Minima is found, corresponding mode candidate is our current
				# mode estimate
				idx = np.argmin(distance_vector)
				mode_ranked[r] = (mode_names[idx], distance_vector[idx])
				# Current minima is replaced with a value larger than maxima,
				# so that we won't return the same estimate twice.
				distance_vector[idx] = (np.amax(distance_vector) + 1)
			return mode_ranked

		else:
			# Nothing is expected to be estimated.
			return 0
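
The PCD "precaution step" above rotates the distribution so that its global minimum becomes the first bin, which keeps a peak from being split across the two ends of the circular distribution. A minimal numpy sketch of that rotation; PitchDistribution.shift() in the library presumably performs an equivalent circular shift on both bins and vals:

import numpy as np

def shift_pcd_to_minimum(vals):
	shift_factor = int(np.argmin(vals))
	# rotate so that the global minimum becomes the first element
	return np.roll(vals, -shift_factor), shift_factor

# e.g. a toy PCD whose peak wraps around the octave boundary
vals = np.array([0.30, 0.05, 0.10, 0.20, 0.35])
shifted, shift_factor = shift_pcd_to_minimum(vals)
print(shift_factor)  # 1; the peak values (0.35, 0.30) are now contiguous at the end of 'shifted'
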