Example #1
# Run the full pipeline: load the data, train the autoencoder, then classify with an SVM.
def whole_model(**kwargs):
    read(kwargs['link'], kwargs['input_dim'])
    _, _, _, _, auto_runtime, auto_err = \
        autoencoder(kwargs['epoch'], kwargs['batch'], kwargs['latent'],
                    kwargs['encoder_o'], kwargs['encoder_i'], kwargs['decoder_i'],
                    kwargs['decoder_o'], kwargs['train_percent'], kwargs['lam'],
                    kwargs['norm_order'], kwargs['loss_plot'])
    _, svm_runtime, svm_err = classify(kwargs['gamma'], kwargs['c'],
                                       kwargs['train_percent'])
    return auto_runtime, auto_err, svm_runtime, svm_err
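
A minimal invocation sketch (not from the original project): the keyword names match the lookups inside whole_model, but every value below is illustrative.

# Hypothetical call; replace the values with ones that fit the real dataset.
runtimes = whole_model(
    link='data.csv', input_dim=64,
    epoch=50, batch=32, latent=8,
    encoder_o=32, encoder_i=64, decoder_i=8, decoder_o=64,
    train_percent=0.8, lam=1e-4, norm_order=2, loss_plot=False,
    gamma=0.1, c=1.0)
auto_runtime, auto_err, svm_runtime, svm_err = runtimes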
Example #2
File: main.py  Project: maxsond/Hail
def intro2():
	# Scripted intro dialogue: print to the left window, read player input
	# from the right window, and hand the final input to the parser.
	display.clear(display.lwin)
	display.msg("./program", False)
	display.clear(display.rwin)
	display.inp()
	display.clear(display.lwin)
	display.msg("Hail, Program.", False)
	display.clear(display.lwin)
	display.msg("Would you kindly open the pod bay doors?", False)
	display.clear(display.rwin)
	display.inp()
	display.clear(display.lwin)
	display.msg("Oh, right, you're the new generation.", False)
	display.clear(display.lwin)
	display.msg("First, you need to access the airlock.", False)
	display.clear(display.lwin)
	display.msg("Just type 'cr airlock' to access the airlock systems.", False)
	display.clear(display.rwin)
	parse.read(display.inp())
Example #3
def run():
    debug = Trace(debugLevel)
    if debugLevel >= 1:
        debug.writeTrace()
    #Check if command line call or not
    if len(sys.argv) > 1:
        fname = sys.argv[1]     #File to analyse is passed as the first argument
    else:
        #Backup file for testing purposes
        fname = r"C:\Users\jordan\Documents\GitHub\javaParser\SampleJavaFiles\ToThePowerOf.java"
    parent = parse.read(fname, debug)
    #walker.getNodeCount(parent)
    #parse.printTree(parent)
    recursions, loops = scan.detect(parent, debug)
    scan.output(recursions, loops)
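
A hedged usage note (an assumption, not taken from the project): run() reads the Java file to analyse from the first command-line argument and falls back to the hard-coded sample path when none is given.
# e.g. python main.py SampleJavaFiles/ToThePowerOf.java   (illustrative invocation)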
Example #4
def populate():
    cols, rows = parse.read()

    with transaction.atomic():

        all_votes = []
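        # collect Vote rows here and bulk-insert them once at the end; the
        # surrounding transaction.atomic() keeps the whole load all-or-nothing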
        for row in rows:
            kraj, _ = Kraj.objects.get_or_create(name='Polska')
            wojewodztwo, _ = Wojewodztwo.objects.get_or_create(name=row[0])
            powiat, _ = Powiat.objects.get_or_create(name=row[4],
                                                     wojewodztwo=wojewodztwo)
            gmina, _ = Gmina.objects.get_or_create(name=row[3],
                                                   code=row[2],
                                                   powiat=powiat)

            for i in range(11, 23):
                candidate, _ = Candidate.objects.get_or_create(name=cols[i])
                vote = constructVote(wojewodztwo, powiat, gmina, candidate,
                                     row[i])
                all_votes.append(vote)

            voters = int(row[6])
            ballots = int(row[7])

            gmina.voters += voters
            gmina.ballots += ballots

            powiat.voters += voters
            powiat.ballots += ballots

            wojewodztwo.voters += voters
            wojewodztwo.ballots += ballots

            kraj.voters += voters
            kraj.ballots += ballots

            gmina.save()
            powiat.save()
            wojewodztwo.save()
            kraj.save()

            print(len(all_votes))

        Vote.objects.bulk_create(all_votes)

    print(Vote.objects.all())
Example #5
def populate():
    cols, rows = parse.read()

    with transaction.atomic():

        all_votes = []
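        # as in the previous example: collect Vote rows and bulk-insert them
        # once, inside a single atomic transaction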
        for row in rows:
            kraj, _ = Kraj.objects.get_or_create(name='Polska')
            wojewodztwo, _ = Wojewodztwo.objects.get_or_create(name=row[0])
            okreg, _ = Okreg.objects.get_or_create(name=row[1], wojewodztwo=wojewodztwo)
            gmina, _ = Gmina.objects.get_or_create(name=row[3], code=row[2], okreg=okreg)

            for i in range(11, 23):
                candidate, _ = Candidate.objects.get_or_create(name=cols[i])
                vote = constructVote(wojewodztwo, okreg, gmina, candidate, row[i])
                all_votes.append(vote)

            max_votes = int(row[6])
            valid_votes = int(row[7])

            gmina.max_votes += max_votes
            gmina.valid_votes += valid_votes

            okreg.max_votes += max_votes
            okreg.valid_votes += valid_votes

            wojewodztwo.max_votes += max_votes
            wojewodztwo.valid_votes += valid_votes

            kraj.max_votes += max_votes
            kraj.valid_votes += valid_votes

            gmina.save()
            okreg.save()
            wojewodztwo.save()
            kraj.save()

            print(len(all_votes))

        Vote.objects.bulk_create(all_votes)

    print(Vote.objects.all())
Example #6
def main():
    args = parse.set()
    config = Config()

    address = parse.read(args, config)

    if not config.api_key:
        print("API key not found, run again with --key apikey")
        return

    if address:
        location = Location(address, config.location_url)
        if location.address:
            weather = Weather(location, config)
            display.show(location, weather)
        return

    if not args.key:
        print('No address supplied')
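
A hedged usage sketch; --key comes from the error message above, the script name and address are illustrative only.
# python weather.py --key YOUR_API_KEY "Berlin"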
Example #7
#files+= ["gap4_2.txt.lp"]
#files = [f for f in listdir(directory) if isfile(join(directory, f))]

files += ['/home/'+username+'/Desktop/instances/toTest/new46.lp']	
files += ['/home/'+username+'/Desktop/instances/toTest/new47.lp']	
files += ['/home/'+username+'/Desktop/instances/miplib/fiber.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/10teams.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/rout.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/noswot.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/modglob.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/gesa2.mps']
files += ['/home/'+username+'/Desktop/instances/miplib/vpm2.mps']
files += ['/home/'+username+'/Desktop/instances/ORLib/airland/airland1R2.mps']

for file in files:
	if not (file.endswith('.mps') or file.endswith('.lp')):
		continue
	#path = directory + file
	path = file
	data, row_names = parse.read(path)  
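	# build the pairwise similarity matrix for the parsed instance (strategy 2)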
	sim_matrix = pp.strategy(data, 'sim', 2)
	# float() avoids Python 2 integer division, which would always print 0 here
	print 'Occupancy: ', float(len(sim_matrix.nonzero()[0])) / (sim_matrix.shape[0] * sim_matrix.shape[0])
	#save_matrix_file(dist_matrix, directory, file+'dist_mat')
	#save_matrix_fig(data, directory, file)
	print file, data.shape
	gc.collect()
	#print 'dist_matrix.shape', dist_matrix.shape
	print '-----------------------------------------------'
	
	
	
Example #8
def agg(instance_path, res_folder, strategy=2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) != len(row_names):
		print 'Agglomerative error: data and row_names have diff. lens', len(data), len(row_names)
	#save_matrix_fig(data, res_folder, file+'_in')
	dist_matrix = []
	try:
		dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).toarray()
		print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
	except Exception:
		print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
		dist_matrix = pp.strategy(data, 'distance', strategy)
		scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy), dist_matrix)
		dist_matrix = dist_matrix.toarray()

	dist_matrix[dist_matrix == 0] = 1000 #float('inf')
	for i in range(dist_matrix.shape[0]):
		for j in range(i+1):
			dist_matrix[i,j] = 0

	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = dist_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	max_std_dev = n
	sec_threshold = 0.0001
	n_iterations = 20  		# must be an odd number 

	
	# cluster the data with hierarchical (agglomerative) clustering -----------------


	print 'Running Agglomerative Clustering...'
	z = hierarchy.linkage(dist_matrix, method='complete')
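	# knee: second difference of the reversed merge heights; its largest values
	# suggest natural places to cut the dendrogram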
	knee = np.diff(z[::-1, 2], 2)
	print 'z = ', z

	max_n_cl = []
	# find the knees
	for i in range(n_iterations):
		temp_n_cl = knee.argmax() + 2
		knee[knee.argmax()] = 0
		if temp_n_cl < 0.5*n:
			print 'temp_n_cl = ', temp_n_cl
			max_n_cl.append(temp_n_cl)

	max_n_cl = np.unique(max_n_cl)

	for iteration in range(len(max_n_cl)):

		print 'iteration = ', iteration
		labels = hierarchy.fcluster(z, max_n_cl[iteration], 'maxclust')
		# the algorithm returns labels starting from 1, but we want them to start from 0
		labels = np.array([label-1 for label in labels])

		n_clusters = len(set(labels)) - (1 if -1 in labels else 0)


		num_per_cluster = {}
		for i in range(n_clusters):
			num_per_cluster[i] = 0

		for label in labels:
			for i in range(n_clusters):
				if label == i:
					num_per_cluster[i] += 1
		
		# ---------------------------------------------------------------------------------------
		# display some information
		print 'Estimated number of clusters: ',  n_clusters		
		print 'Number of points per cluster: ', num_per_cluster
		#draw(A=dist_matrix, colors=labels)
		# ---------------------------------------------------------------------------------------
		sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)

		# pull down the points whose non-zero values collide with points from other clusters
		sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																		sotred_labels, sorted_names, column_labels)

		num_per_cluster = {}
		n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
		if -1 in sotred_labels2:
			all_clusters_list = range(-1, n_clusters)
		else:
			all_clusters_list = range(n_clusters)

		for i in all_clusters_list:
			num_per_cluster[i] = 0

		for label in sotred_labels2:
			for i in all_clusters_list:
				if label == i:
					num_per_cluster[i] += 1
		non_clustered = 0
		for label in sotred_labels2:
			if label == -1:
				non_clustered += 1

		if n_clusters == 1 and non_clustered == 0:
			continue
		print 'Estimated number of clusters after removal: ',  n_clusters
		print 'Number of points per cluster after removal: ', num_per_cluster
		print 'Number of non clustered points after removal:', non_clustered
		if 0 in num_per_cluster.values():
			print 'TIME TO DEBUG:'
			print 'sotred_labels2 = ', sotred_labels2

		# save picture of end matrix
		#save_matrix_fig(sorted_data, res_folder, file + '_A_dec' +  str(iteration))
		#if res2_folder != 'none':
			#save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' +  str(iteration))
		# find the best iteration, so we only save the best one --------------------------
		label_name_pairs = zip(sotred_labels2, sorted_names2)
		if non_clustered < min_non_clusterd:
			res_from_diff_params[iteration] = label_name_pairs
			nr_clusters_from_diff_params[iteration] = n_clusters
			non_clustered_from_diff_params[iteration] = non_clustered
			distribution_from_diff_params[iteration] = num_per_cluster
			min_non_clusterd = non_clustered
			if n_clusters > 1:
				second_best = iteration
			best_iteration = iteration
			print 'this is best iteration currently'
		
		# find the best iteration (according to the variance of cluster sizes), --------
		# so we only save the best one
		temp_num_per_cluster = num_per_cluster.copy()
		if -1 in temp_num_per_cluster.keys():
			del temp_num_per_cluster[-1]
		if len(temp_num_per_cluster.values()) > 1:
			std_dev = np.std(temp_num_per_cluster.values())
			mean = np.mean(temp_num_per_cluster.values())
			rel_std_dev = std_dev / mean
			# float() avoids Python 2 integer division, which would zero this factor
			rel_std_dev *= pow(float(non_clustered)/n, 2)
			print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
			std_dev = rel_std_dev
			# we accept the iteration if adjusted rel_std_dev is smaller, or
			# if it is within the threshold and the number of non-clustered points is smaller
			if (std_dev - max_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
				sec_criteria_fulfilled = True
			else:
				sec_criteria_fulfilled = False
			if std_dev < max_std_dev or sec_criteria_fulfilled:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				max_std_dev = std_dev
				s_min_non_clusterd = non_clustered
				sec_best_iteration = iteration
				print 'this is second best iteration currently'		
		print '_______________________________________________________'
	# ----------------------------------------------------------------------------------
	
				

	best_found = False	
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration

	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_agg_' + str(best_n_clusters) + '_' + str(best_non_clusterd)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration != best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_aggSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	

	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
Example #9
def affProp(instance_path, res_folder, strategy=2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) != len(row_names):
		print 'Aff. prop. error: data and row_names have diff. lens', len(data), len(row_names)
	#save_matrix_fig(data, res_folder, file+'_in')
	sim_matrix = []
	try:
		sim_matrix = np.load(res_folder+file+'_sim'+str(strategy)+'.npy')
		print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy)+'.npy')
	except Exception:
		print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy)+'.npy')
		sim_matrix = pp.strategy(data, 'sim', strategy)
		np.save(res_folder+file+'_sim'+str(strategy), sim_matrix)

	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = sim_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	max_std_dev = n
	sec_threshold = 0.0001
	n_iterations = 20  		# must be an odd number 

	sim_matrix[sim_matrix == 0] = -1e10
	#min_preferance = 0
	#min_preferance *= np.max(sim_matrix[sim_matrix > 0])
	min_preferance = np.min(sim_matrix[sim_matrix > 0]) -10
	max_preferance = np.median(sim_matrix[sim_matrix > 0])
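	# the preference controls how many exemplars affinity propagation selects;
	# it is scanned from just below the smallest positive similarity up to the median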
	print 'min_preferance, ', min_preferance
	print 'max_preferance, ', max_preferance
	
	if min_preferance > max_preferance:
		raise Exception('Something is wrong with preference setting: %d %d' %
			(min_preferance, max_preferance))
	elif min_preferance == max_preferance:
		n_iterations = 1
		pref_list = [min_preferance]
	
	pref_step = (max_preferance-min_preferance) / n_iterations

	# cluster the data with Affinity Propagation ---------------------------------------------
	for iteration in range(n_iterations):
		
		if iteration == 0:
			preference = min_preferance
		else:
			preference += pref_step
		labels = []
		print '_______________________________________________________'
		print 'Aff. Prop. with preference =', preference
		
		_, labels = affinity_propagation(sim_matrix, preference=preference)
		n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
		
		num_per_cluster = {}
		for i in range(n_clusters):
			num_per_cluster[i] = 0

		for label in labels:
			for i in range(n_clusters):
				if label == i:
					num_per_cluster[i] += 1


		# TODO: criteria for skipping or breaking the loop ---------------------------------------------
		# skip the iteration if the number of clusters is the same as before
		if iteration == 0:
			old_n_clusters = n_clusters
		#elif n_clusters >= old_n_clusters:
		#	break
		old_n_clusters = n_clusters
		# increase the preference
		if n_clusters == 1:
			print 'DEBUG: Aff prop. n_clusters == 1, going to next iteration'
			min_preferance = preference
			max_preferance += (max_preferance - min_preferance) / 2
			pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
			print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
			continue
		# lower the preference
		if n_clusters >= 0.1*n:
			print 'DEBUG: Aff prop. n_clusters = %i, TOO HIGH!!!' %n_clusters
			max_preferance = preference
			min_preferance = preference - pref_step
			pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
			print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
			continue	
		# ---------------------------------------------------------------------------------------
		# display some information
		print 'Estimated number of clusters: ',  n_clusters		
		print 'Number of points per cluster: ', num_per_cluster
		#draw(A=sim_matrix, colors=labels)
		# ---------------------------------------------------------------------------------------
		sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
		#print 'DEBUG:'
		#print 'column_labels = ', column_labels
		#print 'sotred_labels = ', sotred_labels
		#save_matrix_fig(sorted_data, res_folder, file + '_B_dec' +  str(iteration))

		# pull down the points whose non-zero values collide with points from other clusters
		sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																		sotred_labels, sorted_names, column_labels)

		num_per_cluster = {}
		n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
		if -1 in sotred_labels2:
			all_clusters_list = range(-1, n_clusters)
		else:
			all_clusters_list = range(n_clusters)

		for i in all_clusters_list:
			num_per_cluster[i] = 0

		for label in sotred_labels2:
			for i in all_clusters_list:
				if label == i:
					num_per_cluster[i] += 1
		non_clustered = 0
		for label in sotred_labels2:
			if label == -1:
				non_clustered += 1
		print 'Estimated number of clusters after removal: ',  n_clusters
		print 'Number of points per cluster after removal: ', num_per_cluster
		print 'Number of non clustered points after removal:', non_clustered
		if 0 in num_per_cluster.values():
			print 'TIME TO DEBUG:'
			print 'sotred_labels2 = ', sotred_labels2

		# save picture of end matrix
		#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		#if res2_folder != 'none':
			#save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' +  str(iteration))
		# find the best iteration, so we only save the best one --------------------------
		label_name_pairs = zip(sotred_labels2, sorted_names2)
		if non_clustered < min_non_clusterd:
			res_from_diff_params[iteration] = label_name_pairs
			nr_clusters_from_diff_params[iteration] = n_clusters
			non_clustered_from_diff_params[iteration] = non_clustered
			distribution_from_diff_params[iteration] = num_per_cluster
			min_non_clusterd = non_clustered
			if n_clusters > 1:
				second_best = iteration
			best_iteration = iteration
			print 'this is best iteration currently'
		
		# find the best iteration (according to the variance of cluster sizes), --------
		# so we only save the best one
		temp_num_per_cluster = num_per_cluster.copy()
		if -1 in temp_num_per_cluster.keys():
			del temp_num_per_cluster[-1]
		if len(temp_num_per_cluster.values()) > 1:
			std_dev = np.std(temp_num_per_cluster.values())
			mean = np.mean(temp_num_per_cluster.values())
			rel_std_dev = std_dev / mean
			# float() avoids Python 2 integer division, which would zero this factor
			rel_std_dev *= pow(float(non_clustered)/n, 2)
			print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
			std_dev = rel_std_dev
			# we accept the iteration if adjusted rel_std_dev is smaller, or
			# if it is within the threshold and the number of non-clustered points is smaller
			if (std_dev - max_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
				sec_criteria_fulfilled = True
			else:
				sec_criteria_fulfilled = False
			if std_dev < max_std_dev or sec_criteria_fulfilled:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				max_std_dev = std_dev
				s_min_non_clusterd = non_clustered
				sec_best_iteration = iteration
				print 'this is second best iteration currently'		
		# ----------------------------------------------------------------------------------
		print '_______________________________________________________'
				

	best_found = False	
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration

	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_affProp_' + str(best_n_clusters) + '_' + str(best_non_clusterd)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration != best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_affPropSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	
	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
Example #10
File: align.py  Project: sbeschke/amr_ud
import optparse
from alignment import read_neg_polarity_items, get_full_alignment, print_alignment
from parse import read_in_fixed_parses, read, read_and_parse, write_out_parses

parser = optparse.OptionParser()
parser.add_option('-p', '--parses', dest="ud_file", help="file with UD parses")
parser.add_option('-a', '--amrs', dest="amr_file", help="file with AMRs")
parser.add_option('-o', '--output', dest="output_file", help="alignment file")
parser.add_option(
    '-w',
    '--write_out',
    dest="write_out_ud",
    default="",
    help="file to which write UD parses; if none, don't write out")
(opts, _) = parser.parse_args()

if opts.ud_file:
    sentences = read_in_fixed_parses(read(opts.amr_file), opts.ud_file)
else:
    sentences = read_and_parse(opts.amr_file)
    if opts.write_out_ud:
        write_out_parses(sentences, opts.write_out_ud)
neg_dict = read_neg_polarity_items('neg-polarity.txt')
alignments = get_full_alignment(sentences, neg_dict)
print_alignment(alignments, opts.output_file)
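
A hedged invocation sketch; the flags match the option definitions above, the file names are illustrative.
# python align.py -a amrs.txt -p parses.conllu -o alignments.txt
# python align.py -a amrs.txt -w parses_out.conllu -o alignments.txt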
Example #11
def dbscan(instance_path, res_folder, strategy=2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) != len(row_names):
		print 'DBSCAN error: data and row_names have diff. lens', len(data), len(row_names)
	#save_matrix_fig(data, res_folder, file+'_in')
	dist_matrix = []
	try:
		dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).tocsr()
		print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
	except Exception:
		print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
		dist_matrix = pp.strategy(data, 'distance', strategy)
		scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy), dist_matrix)

	# this part is important for BPP-like instances!!! ---------------------------------------	
	'''if check_if_BPP_like(dist_matrix):
		bpp_like = all(True if x == 0 or x == 1 else False for x in np.nditer(dist_matrix))
	else:
		bpp_like = False'''
	bpp_like = False

	# float() avoids Python 2 integer division, which would always make this 0
	occupancy = float(len(dist_matrix.data)) / (dist_matrix.shape[0] * dist_matrix.shape[1]) * 100
	q = 10
	dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
	print 'dist_percentile = ', dist_percentile
	if dist_percentile == 0: # or strategy == 6:
		q = 1
		print 'Recalculating dist_percentile..'
		#dist_percentile = np.percentile(a=dist_matrix, q=q)
		dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)

	print 'dist_percentile = ', dist_percentile
	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = dist_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	min_std_dev = n
	sec_threshold = 0.0001
	n_iterations = 49  		# must be an odd number 
	eps_list = get_eps_list(mid=dist_percentile, length=n_iterations, strategy=strategy)

	print 'eps_list = ', eps_list 

	# cluster the data with DBSCAN ---------------------------------------------
	for iteration in range(n_iterations):
		gc.collect()
		if dist_percentile == 0 and not bpp_like:
			print 'dist_percentile = %i, -> we cannot use DBSCAN for clustering this instance.' %dist_percentile
			break
		# eps is in range: [dist_percentile - 0.5, dist_percentile + 0.5] but with geometric progression
		eps = eps_list[iteration]

		if eps <= 0 and not bpp_like:
			continue
		if eps >= 1 and not bpp_like:
			break	
		# for distance strategy 1: 0.054...
		#eps = 0.1 + (iteration / 10) 
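		# min_samples: minimum neighbourhood size for a DBSCAN core point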
		min_samples = 4
		#print 'DEBUG: eps = ', eps
		labels = []
		print '_______________________________________________________'
		print 'iteration= ', iteration
		print 'eps = ', eps
		print 'min_samples = ', min_samples
		if bpp_like:
			print 'DEBUG: Running getLabelsFrom01Dist()'
			labels = getLabelsFrom01Dist(dist_matrix)
		else:
			print 'Running DBSCAN...'
			try:
				db = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit(dist_matrix)
			except ValueError:
				print 'Value error occurred in DBSCAN. Stop.'
				raise
			labels = db.labels_
			

		n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
		
		num_per_cluster = {}
		for i in range(n_clusters):
			num_per_cluster[i] = 0

		for label in labels:
			for i in range(n_clusters):
				if label == i:
					num_per_cluster[i] += 1
		non_clustered = 0
		for label in labels:
			if label == -1:
				non_clustered += 1
		
		# criteria for skipping or breaking the loop ---------------------------------------------
		# skip the iteration if the number of clusters is the same as before
		if iteration == 0:
			old_n_clusters = n_clusters
			old_non_clustered = non_clustered
		if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
			continue
		old_n_clusters = n_clusters
		old_non_clustered = non_clustered
		if n_clusters == 1 and non_clustered == 0:
			print 'Stopping because bigger EPS will be the same.'
			break
		# ---------------------------------------------------------------------------------------
		# display some information
		print 'Estimated number of clusters: ',  n_clusters		
		print 'Number of points per cluster: ', num_per_cluster
		print 'Number of non clustered points:', non_clustered
		#draw(A=sim_matrix, colors=labels)
		# ---------------------------------------------------------------------------------------
		sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
		#print 'DEBUG:'
		#print 'column_labels = ', column_labels
		#print 'sotred_labels = ', sotred_labels
		#save_matrix_fig(sorted_data, res_folder, file + '_B_dec' +  str(iteration))

		# pull down the points whose non-zero values collide with points from other clusters
		sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																				sotred_labels, sorted_names, column_labels)

		num_per_cluster = {}
		n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
		if -1 in sotred_labels2:
			all_clusters_list = range(-1, n_clusters)
		else:
			all_clusters_list = range(n_clusters)

		for i in all_clusters_list:
			num_per_cluster[i] = 0

		for label in sotred_labels2:
			for i in all_clusters_list:
				if label == i:
					num_per_cluster[i] += 1
		non_clustered = 0
		for label in sotred_labels2:
			if label == -1:
				non_clustered += 1
		print 'Estimated number of clusters after removal: ',  n_clusters
		print 'Number of points per cluster after removal: ', num_per_cluster
		print 'Number of non clustered points after removal:', non_clustered
		if 0 in num_per_cluster.values():
			print 'TIME TO DEBUG:'
			print 'sotred_labels2 = ', sotred_labels2

		# save picture of end matrix
		#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		#if res2_folder != 'none':
			#save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' +  str(iteration))
		# find the best iteration, so we only save the best one --------------------------
		label_name_pairs = zip(sotred_labels2, sorted_names2)
		if non_clustered < min_non_clusterd:
			res_from_diff_params[iteration] = label_name_pairs
			nr_clusters_from_diff_params[iteration] = n_clusters
			non_clustered_from_diff_params[iteration] = non_clustered
			distribution_from_diff_params[iteration] = num_per_cluster
			min_non_clusterd = non_clustered
			if n_clusters > 1:
				second_best = iteration
			best_iteration = iteration
			print 'this is best iteration currently'
		if bpp_like:
			print 'This instance was BPP-like.'
			break
		# find the best iteration (according to the variance of cluster sizes), --------
		# so we only save the best one
		temp_num_per_cluster = num_per_cluster.copy()
		if -1 in temp_num_per_cluster.keys():
			del temp_num_per_cluster[-1]
		if len(temp_num_per_cluster.values()) > 1:
			std_dev = np.std(temp_num_per_cluster.values())
			mean = np.mean(temp_num_per_cluster.values())
			rel_std_dev = std_dev / mean
			# float() avoids Python 2 integer division, which would zero this factor
			rel_std_dev *= pow(float(non_clustered)/n, 2)
			print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
			std_dev = rel_std_dev
			# we accept the iteration if adjusted rel_std_dev is smaller, or
			# if it is within the threshold and the number of non-clustered points is smaller
			if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
				sec_criteria_fulfilled = True
			else:
				sec_criteria_fulfilled = False
			if std_dev < min_std_dev or sec_criteria_fulfilled:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				min_std_dev = std_dev
				s_min_non_clusterd = non_clustered
				sec_best_iteration = iteration
				print 'this is second best iteration currently'		
		# ----------------------------------------------------------------------------------
		print '_______________________________________________________'
				

	best_found = False	
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration

	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_dbscan_' + str(best_n_clusters) + '_' + str(best_non_clusterd) +'_dist'+str(strategy)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration != best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_dbscanSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) +'_dist'+str(strategy)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	
	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
Example #12
File: lisp.py  Project: ahmed532/lispy
def main():
    if len(sys.argv) == 2:
        leval_file(sys.argv[1])
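    # REPL: read a form, evaluate it in the global environment, print the result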
    loop(lambda: lprint(leval(read(), lglobals)))
Example #13
File: main.py  Project: maxsond/Blind
def prompt():
	p = "What do you do?"
	h = len(p)/2
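	# centre the prompt: h is half its length, used as a horizontal offset below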
	q = msg(y/2, x/2-h, 0.1, p, 'right')
	i = display.inp(q)
	read(i)
Example #14
File: eval.py  Project: 602p/lel

@builtin("if", 3, is_fexpr=True)
def bi_if(args, scope):
    if eval(args[0], scope) == LNil:
        return eval(args[2], scope)
    else:
        return eval(args[1], scope)


@builtin("mkfunc", 4, argtypes=[LStr, None, None, LCons])
def bi_mkfunc(args, scope):
    return LFunc(args[0].value, not args[1].is_nil(), args[2], args[3].clone())


@builtin("fn1", 2, is_fexpr=True, argtypes=[LSym, None])
def bi_fn1(args, scope):
    return LFunc(
        "<fn1 lambda>", False,
        LCons.from_py_list([
            LSym("^let"), args[0],
            LCons.from_py_list([LSym("^lhs"), LSym("$args")]), args[1]
        ]), scope)


with open("test.fl", 'r') as fd:
    code = read(fd.read())
print("Code:", code.l_str())
result = eval(code, builtins)
print("Result:", result)
print("Result.l_str():", result.l_str())
Example #15
def mcl(instance_path, res_folder, strategy=2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) != len(row_names):
		print 'MCL error: data and row_names have diff. lens', len(data), len(row_names)
		raise Exception
	#save_matrix_fig(data, res_folder, file+'_in')
	sim_matrix = []
	'''try:
		sim_matrix = np.load(res_folder+file+'_sim'+str(strategy)+'.npy')
		n = sim_matrix.shape[0]
		print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy)+'.npy')
	except:
		print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy)+'.npy')
		sim_matrix = pp.strategy(data, 'sim',strategy)	
		np.save(res_folder+file+'_sim'+str(strategy)+'.npy', sim_matrix)'''

	try:
		sim_matrix = scipy.io.mmread(res_folder+file+'_sim'+str(strategy)).tocsr()
		print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy))
	except Exception:
		print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy))
		sim_matrix = pp.strategy(data, 'sim', strategy)
		scipy.io.mmwrite(res_folder+file+'_sim'+str(strategy), sim_matrix)
 	
	############################################
	# good visualization
	#draw(sim_matrix, colors=[0 for x in range(sim_matrix.shape[0])])
	############################################
	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	print sim_matrix.shape
	n = sim_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	min_std_dev = n
	sec_threshold = 0.0001
	iteration = 0
	perfect_cl_found = False
	
	min_inf_factor = 1.1
	max_inf_factor = 2.2
	inf_l = max_inf_factor - min_inf_factor

	nr_cl_with_diff_params = []
	total_time = 0
	# cluster the data with MCL ---------------------------------------------
	for exp_iter in range(2,3):
	#for exp_iter in range(2,5):
		nr_cl_with_diff_inf = []
		#for save_id, inf_iter in enumerate(np.arange(1.2, 1.85, 0.05)):
		for save_id, inf_iter in enumerate([1.3]):
		#for inf_iter in range(6):	
			gc.collect()
			labels = [-1 for x in range(n)]

			print '#############################################################'
			expand_factor = exp_iter
			#inflate_factor = 1.2 + inf_iter*0.4
			inflate_factor = inf_iter
			print 'DEBUG: iteration = ', iteration
			try:
				labels = np.load(res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy')
				print 'Existing clustering found for expand = %i, inflate = %f' %(expand_factor,inflate_factor)
				print res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy'
			except Exception:
				print 'NO existing clustering found for expand = %i, inflate = %f' %(expand_factor,inflate_factor)
				print 'Expand Factor = ', expand_factor
				print 'Inflate Factor = ', inflate_factor
				#print 'NO existing clustering found for expand = %i, inflate = %i.' %(expand_factor,inflate_factor)
				# inflation weakens relations between clusters and strengthens relations within clusters
				# the expansion operator is responsible for allowing flow to connect different regions of the graph
				# the bigger the expand_factor, the fewer clusters (too big and everything lands in one cluster)
				start_time = time.time()
				clusters = mcl_implementation(sim_matrix, expand_factor = expand_factor, max_loop = 20,
									inflate_factor = inflate_factor)
				curr_time = (time.time() - start_time)
				print 'curr_time: ', curr_time
				total_time += curr_time
				clust_map = {}
				for k, vals in clusters.items():
					for v in vals:
						clust_map[v] = k

				colors = []
				for i in range(n):
					colors.append(clust_map.get(i, 100))

				set_of_colors = set(colors)	
				print 'DEBUG: n clusters = ', len(set_of_colors)

				

				cur_label = 0
				labels = colors
				for cluster in set_of_colors:
					labels = [cur_label if label == cluster else label for label in labels] 
					cur_label += 1
				labels = np.array(labels)

				np.save(res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy', labels)

			# continue if all points in 1 cluster
			if len(set(labels)) == 1:
				print 'skipping this iteration...'
				nr_cl_with_diff_inf += [1]
				continue

			# change expand factor if all points in different clusters
			if len(set(labels)) >= 0.3*n:
				print 'moving to next expand factor...'
				break
			############################################
			# good visualization
			'''if iteration == 3 and file != 'pp08aCUTS':
				draw(sim_matrix, colors=[label+1 for label in sotred_labels2])
			if iteration == 1 and file == 'pp08aCUTS':
				draw(sim_matrix, colors=[label+1 for label in sotred_labels2])'''
			############################################
			
			n_clusters = len(set(labels)) - (1 if -1 in labels else 0)


			nr_cl_with_diff_inf += [n_clusters]


			num_per_cluster = {}
			for i in range(n_clusters):
				num_per_cluster[i] = 0

			for label in labels:
				for i in range(n_clusters):
					if label == i:
						num_per_cluster[i] += 1
			non_clustered = 0
			for label in labels:
				if label == -1:
					non_clustered += 1
			
			# criteria for skipping or breaking the loop ---------------------------------------------
			# skip the iteration if the number of clusters is the same as before
			if iteration == 0:
				old_n_clusters = n_clusters
				old_non_clustered = non_clustered
			if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
				continue
			old_n_clusters = n_clusters
			old_non_clustered = non_clustered
			
			# ---------------------------------------------------------------------------------------
			# display some information
			print 'Estimated number of clusters: ',  n_clusters		
			print 'Number of points per cluster: ', num_per_cluster
			#print 'REMOVE ME! '
			#continue
			# ---------------------------------------------------------------------------------------
			sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
			
			# pull down the points whose non-zero values collide with points from other clusters
			sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																	sotred_labels, sorted_names, column_labels)

			num_per_cluster = {}
			n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
			if -1 in sotred_labels2:
				all_clusters_list = range(-1, n_clusters)
			else:
				all_clusters_list = range(n_clusters)

			for i in all_clusters_list:
				num_per_cluster[i] = 0

			for label in sotred_labels2:
				for i in all_clusters_list:
					if label == i:
						num_per_cluster[i] += 1
			non_clustered = 0
			for label in sotred_labels2:
				if label == -1:
					non_clustered += 1
			print 'Number of points per cluster after removal: ', num_per_cluster
			print 'Number of non clustered points after removal:', non_clustered
			
			# change expand factor if all points in different clusters
			if n_clusters >= 0.1*n:
				print 'moving to next expand factor...'
				break
			

			# save picture of end matrix
			#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		
			# find the best iteration (according to # of non clustered points), --------------
			# so we only save the best one 
			label_name_pairs = zip(sotred_labels2, sorted_names2)
			if non_clustered < min_non_clusterd:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				min_non_clusterd = non_clustered
				best_iteration = iteration
				print 'this is best iteration currently'	
			if non_clustered == 0:
				print 'Perfect clustering found!'
				perfect_cl_found = True
				sec_best_iteration = iteration
				break
			# find the best iteration (according to the variance of cluster sizes), --------
			# so we only save the best one
			temp_num_per_cluster = num_per_cluster.copy()
			if -1 in temp_num_per_cluster.keys():
				del temp_num_per_cluster[-1]
			if len(temp_num_per_cluster.values()) > 1:
				std_dev = np.std(temp_num_per_cluster.values())
				mean = np.mean(temp_num_per_cluster.values())
				rel_std_dev = std_dev / mean
				# float() avoids Python 2 integer division, which would zero this factor
				rel_std_dev *= pow(float(non_clustered)/n, 2)
				print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
				std_dev = rel_std_dev
				# we accept the iteration if adjusted rel_std_dev is smaller, or
				# if it is within the threshold and the number of non-clustered points is smaller
				if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
					sec_criteria_fulfilled = True
				else:
					sec_criteria_fulfilled = False
				if std_dev < min_std_dev or sec_criteria_fulfilled:
					res_from_diff_params[iteration] = label_name_pairs
					nr_clusters_from_diff_params[iteration] = n_clusters
					non_clustered_from_diff_params[iteration] = non_clustered
					distribution_from_diff_params[iteration] = num_per_cluster
					min_std_dev = std_dev
					s_min_non_clusterd = non_clustered
					sec_best_iteration = iteration
					print 'this is second best iteration currently'			
			# ----------------------------------------------------------------------------------
			print '#############################################################'
			iteration += 1
		nr_cl_with_diff_params += [nr_cl_with_diff_inf]
		if perfect_cl_found:
			break
					

	best_found = False
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration
	
	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_mcl_' + str(best_n_clusters) + '_' + str(best_non_clusterd) +'_sim'+str(strategy)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration != best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_mclSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) +'_sim'+str(strategy)
		if sec_best_iteration != best_iteration:
			dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	

	print 'Total_time: ', total_time	
	print '_______________________________________________________'
	print '_______________________________________________________'	    
	gc.collect()

	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
Example #16
def em(instance_path, res_folder, strategy=2):
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) != len(row_names):
		print 'EM error: data and row_names have diff. lens', len(data), len(row_names)
	#save_matrix_fig(data, res_folder, file+'_in')

	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = len(data)
	min_non_clusterd = n
	s_min_non_clusterd = n
	min_std_dev = n
	sec_threshold = 0.0001
	iteration = 0
	perfect_cl_found = False
	failed = False
	# cluster the data with EM ---------------------------------------------
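	# sweep the number of mixture components K; each EM run yields one candidate labelling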
	for K in range(2,4):
		for rand_iter in range(1):
			gc.collect()
			print '#############################################################'
			print 'File: ', file
			print 'DEBUG: iteration = ', iteration
			print 'DEBUG: K = %i, try = %i' %(K,rand_iter)

			try:
				labels = em_implementation(data, K = K)
			except ValueError:
				print 'FAILED'
				failed = True	
				break

			# continue if all in 1 cluster or all points in different cluster
			if len(set(labels)) == 1 or len(set(labels)) > 0.9 * n:
				#print 'DEBUG: labels: ', labels
				print 'skipping this iteration...'
				continue
			
			n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
			
			num_per_cluster = {}
			for i in range(n_clusters):
				num_per_cluster[i] = 0

			for label in labels:
				for i in range(n_clusters):
					if label == i:
						num_per_cluster[i] += 1
			non_clustered = 0
			for label in labels:
				if label == -1:
					non_clustered += 1
			
			# criteria for skipping or breaking the loop ---------------------------------------------
			# skip the iteration if the number of clusters is the same as before
			if iteration == 0:
				old_n_clusters = n_clusters
				old_non_clustered = non_clustered
			#if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
			#	continue
			old_n_clusters = n_clusters
			old_non_clustered = non_clustered

			# ---------------------------------------------------------------------------------------
			# display some information
			print 'Estimated number of clusters: ',  n_clusters		
			print 'Number of points per cluster: ', num_per_cluster
			# this is the case where K is too big and some clusters stay empty
			if np.sum(num_per_cluster.values()) != data.shape[0]:
				continue
			# ---------------------------------------------------------------------------------------
			sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
			
			# pull down the points whose non-zero values collide with points from other clusters
			sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																	sotred_labels, sorted_names, column_labels)

			num_per_cluster = {}
			n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
			if -1 in sotred_labels2:
				all_clusters_list = range(-1, n_clusters)
			else:
				all_clusters_list = range(n_clusters)

			for i in all_clusters_list:
				num_per_cluster[i] = 0

			for label in sotred_labels2:
				for i in all_clusters_list:
					if label == i:
						num_per_cluster[i] += 1
			non_clustered = 0
			for label in sotred_labels2:
				if label == -1:
					non_clustered += 1
			print 'Number of points per cluster after removal: ', num_per_cluster
			print 'Number of non clustered points after removal:', non_clustered
			if 0 in num_per_cluster.values():
				print 'TIME TO DEBUG:'
				print 'sotred_labels2 = ', sotred_labels2

			# save picture of end matrix
			#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		
			# find the best iteration (according to # of non clustered points), --------------
			# so we only save the best one 
			label_name_pairs = zip(sotred_labels2, sorted_names2)
			if non_clustered < min_non_clusterd:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				min_non_clusterd = non_clustered
				best_iteration = iteration
				print 'this is best iteration currently'	
			if non_clustered == 0:
				print 'Perfect clustering found!'
				perfect_cl_found = True
				sec_best_iteration = iteration
				break
			# find the best iteration (according to the variance of cluster sizes), --------
			# so we only save the best one
			temp_num_per_cluster = num_per_cluster.copy()
			if -1 in temp_num_per_cluster.keys():
				del temp_num_per_cluster[-1]
			if len(temp_num_per_cluster.values()) > 1:
				std_dev = np.std(temp_num_per_cluster.values())
				mean = np.mean(temp_num_per_cluster.values())
				rel_std_dev = std_dev / mean
				# float() avoids Python 2 integer division, which would zero this factor
				rel_std_dev *= pow(float(non_clustered)/n, 2)
				print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
				std_dev = rel_std_dev
				# we accept the iteration if adjusted rel_std_dev is smaller, or
				# if it is within the threshold and the number of non-clustered points is smaller
				if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
					sec_criteria_fulfilled = True
				else:
					sec_criteria_fulfilled = False
				if std_dev < min_std_dev or sec_criteria_fulfilled:
					res_from_diff_params[iteration] = label_name_pairs
					nr_clusters_from_diff_params[iteration] = n_clusters
					non_clustered_from_diff_params[iteration] = non_clustered
					distribution_from_diff_params[iteration] = num_per_cluster
					min_std_dev = std_dev
					s_min_non_clusterd = non_clustered
					sec_best_iteration = iteration
					print 'this is second best iteration currently'		
			# ----------------------------------------------------------------------------------
			print '#############################################################'
			iteration += 1
		if failed:
			break	
	
					

	best_found = False
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration
	
	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_em_' + str(best_n_clusters) + '_' + str(best_non_clusterd) +'_str'+str(strategy)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		if sec_best_iteration != best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_emSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) +'_str'+str(strategy)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	    
	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec