def whole_model(**kwargs):
    read(kwargs['link'], kwargs['input_dim'])
    _, _, _, _, auto_runtime, auto_err = \
        autoencoder(kwargs['epoch'], kwargs['batch'], kwargs['latent'],
                    kwargs['encoder_o'], kwargs['encoder_i'],
                    kwargs['decoder_i'], kwargs['decoder_o'],
                    kwargs['train_percent'], kwargs['lam'],
                    kwargs['norm_order'], kwargs['loss_plot'])
    _, svm_runtime, svm_err = classify(kwargs['gamma'], kwargs['c'], kwargs['train_percent'])
    return auto_runtime, auto_err, svm_runtime, svm_err
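# Usage sketch (not from the original source): whole_model() above reads every
# hyperparameter from **kwargs, so a call only has to supply those keys. The
# values and the dataset path below are placeholders, not values used by the
# original author.
if __name__ == '__main__':
    auto_runtime, auto_err, svm_runtime, svm_err = whole_model(
        link='data/example.csv',        # hypothetical dataset path
        input_dim=784,
        epoch=50, batch=32, latent=16,
        encoder_o=128, encoder_i=64,
        decoder_i=64, decoder_o=128,
        train_percent=0.8, lam=1e-4, norm_order=2, loss_plot=False,
        gamma=0.1, c=1.0)               # parameters forwarded to classify()
    print(auto_runtime, auto_err, svm_runtime, svm_err)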
def intro2():
    display.clear(display.lwin)
    display.msg("./program", False)
    display.clear(display.rwin)
    display.inp()
    display.clear(display.lwin)
    display.msg("Hail, Program.", False)
    display.clear(display.lwin)
    display.msg("Would you kindly open the pod bay doors?", False)
    display.clear(display.rwin)
    display.inp()
    display.clear(display.lwin)
    display.msg("Oh, right, you're the new generation.", False)
    display.clear(display.lwin)
    display.msg("First, you need to access the airlock.", False)
    display.clear(display.lwin)
    display.msg("Just type 'cr airlock' to access the airlock systems.", False)
    display.clear(display.rwin)
    parse.read(display.inp())
def run():
    debug = Trace(debugLevel)
    if debugLevel >= 1:
        debug.writeTrace()
    # Check if command line call or not
    if len(sys.argv) > 1:
        fname = sys.argv[1]  # File to analyse is passed as the first argument
    else:
        # Backup file for testing purposes
        fname = r"C:\Users\jordan\Documents\GitHub\javaParser\SampleJavaFiles\ToThePowerOf.java"
    parent = parse.read(fname, debug)
    #walker.getNodeCount(parent)
    #parse.printTree(parent)
    recursions, loops = scan.detect(parent, debug)
    scan.output(recursions, loops)
def populate():
    cols, rows = parse.read()
    with transaction.atomic():
        all_votes = []
        for row in rows:
            kraj, _ = Kraj.objects.get_or_create(name='Polska')
            wojewodztwo, _ = Wojewodztwo.objects.get_or_create(name=row[0])
            powiat, _ = Powiat.objects.get_or_create(name=row[4], wojewodztwo=wojewodztwo)
            gmina, _ = Gmina.objects.get_or_create(name=row[3], code=row[2], powiat=powiat)
            for i in range(11, 23):
                candidate, _ = Candidate.objects.get_or_create(name=cols[i])
                vote = constructVote(wojewodztwo, powiat, gmina, candidate, row[i])
                all_votes.append(vote)
            voters = int(row[6])
            ballots = int(row[7])
            gmina.voters += voters
            gmina.ballots += ballots
            powiat.voters += voters
            powiat.ballots += ballots
            wojewodztwo.voters += voters
            wojewodztwo.ballots += ballots
            kraj.voters += voters
            kraj.ballots += ballots
            gmina.save()
            powiat.save()
            wojewodztwo.save()
            kraj.save()
        print(len(all_votes))
        Vote.objects.bulk_create(all_votes)
        print(Vote.objects.all())
def populate():
    cols, rows = parse.read()
    with transaction.atomic():
        all_votes = []
        for row in rows:
            kraj, _ = Kraj.objects.get_or_create(name='Polska')
            wojewodztwo, _ = Wojewodztwo.objects.get_or_create(name=row[0])
            okreg, _ = Okreg.objects.get_or_create(name=row[1], wojewodztwo=wojewodztwo)
            gmina, _ = Gmina.objects.get_or_create(name=row[3], code=row[2], okreg=okreg)
            for i in range(11, 23):
                candidate, _ = Candidate.objects.get_or_create(name=cols[i])
                vote = constructVote(wojewodztwo, okreg, gmina, candidate, row[i])
                all_votes.append(vote)
            max_votes = int(row[6])
            valid_votes = int(row[7])
            gmina.max_votes += max_votes
            gmina.valid_votes += valid_votes
            okreg.max_votes += max_votes
            okreg.valid_votes += valid_votes
            wojewodztwo.max_votes += max_votes
            wojewodztwo.valid_votes += valid_votes
            kraj.max_votes += max_votes
            kraj.valid_votes += valid_votes
            gmina.save()
            okreg.save()
            wojewodztwo.save()
            kraj.save()
        print(len(all_votes))
        Vote.objects.bulk_create(all_votes)
        print(Vote.objects.all())
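# Hedged sketch (assumption, not in the source): constructVote() is called by the
# populate() variants above but never shown. For the Okreg variant, a minimal
# version might just build an unsaved Vote instance so Vote.objects.bulk_create()
# can insert everything at once; the Vote field names here are guesses.
def constructVote(wojewodztwo, okreg, gmina, candidate, count):
    return Vote(wojewodztwo=wojewodztwo, okreg=okreg, gmina=gmina,
                candidate=candidate, count=int(count))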
def main():
    args = parse.set()
    config = Config()
    address = parse.read(args, config)
    if not config.api_key:
        print("API key not found, run again with --key apikey")
        return
    if address:
        location = Location(address, config.location_url)
        if location.address:
            weather = Weather(location, config)
            display.show(location, weather)
        return
    if not args.key:
        print('No address supplied')
#files+= ["gap4_2.txt.lp"] #files = [f for f in listdir(directory) if isfile(join(directory, f))] files += ['/home/'+username+'/Desktop/instances/toTest/new46.lp'] files += ['/home/'+username+'/Desktop/instances/toTest/new47.lp'] files += ['/home/'+username+'/Desktop/instances/miplib/fiber.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/10teams.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/rout.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/noswot.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/modglob.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/gesa2.mps'] files += ['/home/'+username+'/Desktop/instances/miplib/vpm2.mps'] files += ['/home/'+username+'/Desktop/instances/ORLib/airland/airland1R2.mps'] for file in files: if not (file.endswith('.mps') or file.endswith('.lp')): continue #path = directory + file path = file data, row_names = parse.read(path) sim_matrix = pp.strategy(data, 'sim', 2) print 'Occupancy: ', len(sim_matrix.nonzero()[0]) / (sim_matrix.shape[0] * sim_matrix.shape[0]) #save_matrix_file(dist_matrix, directory, file+'dist_mat') #save_matrix_fig(data, directory, file) print file, data.shape gc.collect() #print 'dist_matrix.shape', dist_matrix.shape print '-----------------------------------------------'
def agg(instance_path, res_folder, strategy=2):
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]
    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    if len(data) != len(row_names):
        print 'DBSCAN error: data and row_names have diff. lens', len(data), len(row_names)
    #save_matrix_fig(data, res_folder, file+'_in')
    dist_matrix = []
    try:
        dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).toarray()
        print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
    except:
        print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
        dist_matrix = pp.strategy(data, 'distance', strategy)
        scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy), dist_matrix)
        dist_matrix = dist_matrix.toarray()
    dist_matrix[dist_matrix == 0] = 1000 #float('inf')
    # zero out the lower triangle (including the diagonal)
    for i in range(dist_matrix.shape[0]):
        for j in range(i+1):
            dist_matrix[i, j] = 0

    old_n_clusters = 0
    old_non_clustered = 0
    # dicts to save labels from all iterations, so we can later pick the best clustering
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    n = dist_matrix.shape[0]
    min_non_clusterd = n
    s_min_non_clusterd = n
    max_std_dev = n
    sec_threshold = 0.0001
    n_iterations = 20 # must be an odd number

    # cluster the data with hierarchical clustering ---------------------------------------------
    print 'Running Agglomerative Clustering...'
    z = hierarchy.linkage(dist_matrix, method='complete')
    knee = np.diff(z[::-1, 2], 2)
    print 'z = ', z
    max_n_cl = []
    # find the knees
    for i in range(n_iterations):
        temp_n_cl = knee.argmax() + 2
        knee[knee.argmax()] = 0
        if temp_n_cl < 0.5*n:
            print 'temp_n_cl = ', temp_n_cl
            max_n_cl.append(temp_n_cl)
    max_n_cl = np.unique(max_n_cl)

    for iteration in range(len(max_n_cl)):
        print 'iteration = ', iteration
        labels = hierarchy.fcluster(z, max_n_cl[iteration], 'maxclust')
        # we need this because the algorithm returns labels starting from 1, but we want them to start from 0
        labels = np.array([label-1 for label in labels])
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        num_per_cluster = {}
        for i in range(n_clusters):
            num_per_cluster[i] = 0
        for label in labels:
            for i in range(n_clusters):
                if label == i:
                    num_per_cluster[i] += 1
        # ---------------------------------------------------------------------------------------
        # display some information
        print 'Estimated number of clusters: ', n_clusters
        print 'Number of points per cluster: ', num_per_cluster
        #draw(A=dist_matrix, colors=labels)
        # ---------------------------------------------------------------------------------------
        sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
        # pull down the points which have a non-zero value that collides with points from other clusters
        sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)
        num_per_cluster = {}
        n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
        if -1 in sotred_labels2:
            all_clusters_list = range(-1, n_clusters)
        else:
            all_clusters_list = range(n_clusters)
        for i in all_clusters_list:
            num_per_cluster[i] = 0
        for label in sotred_labels2:
            for i in all_clusters_list:
                if label == i:
                    num_per_cluster[i] += 1
        non_clustered = 0
        for label in sotred_labels2:
            if label == -1:
                non_clustered += 1
        if n_clusters == 1 and non_clustered == 0:
            continue
        print 'Estimated number of clusters after removal: ', n_clusters
        print 'Number of points per cluster after removal: ', num_per_cluster
        print 'Number of non clustered points after removal:', non_clustered
        if 0 in num_per_cluster.values():
            print 'TIME TO DEBUG:'
            print 'sotred_labels2 = ', sotred_labels2
        # save picture of end matrix
        #save_matrix_fig(sorted_data, res_folder, file + '_A_dec' + str(iteration))
        #if res2_folder <> 'none':
        #    save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' + str(iteration))

        # find the best iteration, so we only save the best one --------------------------
        label_name_pairs = zip(sotred_labels2, sorted_names2)
        if non_clustered < min_non_clusterd:
            res_from_diff_params[iteration] = label_name_pairs
            nr_clusters_from_diff_params[iteration] = n_clusters
            non_clustered_from_diff_params[iteration] = non_clustered
            distribution_from_diff_params[iteration] = num_per_cluster
            min_non_clusterd = non_clustered
            if n_clusters > 1:
                second_best = iteration
            best_iteration = iteration
            print 'this is best iteration currently'

        # find the best iteration (according to variance of cluster sizes), ----------------
        # so we only save the best one
        temp_num_per_cluster = num_per_cluster.copy()
        if -1 in temp_num_per_cluster.keys():
            del temp_num_per_cluster[-1]
        if len(temp_num_per_cluster.values()) > 1:
            std_dev = np.std(temp_num_per_cluster.values())
            mean = np.mean(temp_num_per_cluster.values())
            rel_std_dev = std_dev / mean
            rel_std_dev *= pow(non_clustered/n, 2)
            print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
            std_dev = rel_std_dev
            # we accept the iteration if adjusted rel_std_dev is smaller, or
            # if it is within the threshold and the number of non-clustered points is smaller
            if (std_dev - max_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                sec_criteria_fulfiled = True
            else:
                sec_criteria_fulfiled = False
            if std_dev < max_std_dev or sec_criteria_fulfiled:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                max_std_dev = std_dev
                s_min_non_clusterd = non_clustered
                sec_best_iteration = iteration
                print 'this is second best iteration currently'
        print '_______________________________________________________'
    # ----------------------------------------------------------------------------------

    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1: data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1: data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_agg_' + str(best_n_clusters) + '_' + str(best_non_clusterd)
        dec.write(path=res_folder, filename=best_dec, label_name_pairs=res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    if sec_best_iteration >= 0:
        if sec_best_iteration != best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_aggSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd)
            dec.write(path=res_folder, filename=s_dec, label_name_pairs=res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
def affProp(instance_path, res_folder, strategy=2):
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]
    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    if len(data) != len(row_names):
        print 'Af prop error: data and row_names have diff. lens', len(data), len(row_names)
    #save_matrix_fig(data, res_folder, file+'_in')
    sim_matrix = []
    try:
        sim_matrix = np.load(res_folder+file+'_sim'+str(strategy)+'.npy')
        print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy)+'.npy')
    except:
        print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy)+'.npy')
        sim_matrix = pp.strategy(data, 'sim', strategy)
        np.save(res_folder+file+'_sim'+str(strategy), sim_matrix)

    old_n_clusters = 0
    old_non_clustered = 0
    # dicts to save labels from all iterations, so we can later pick the best clustering
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    n = sim_matrix.shape[0]
    min_non_clusterd = n
    s_min_non_clusterd = n
    max_std_dev = n
    sec_threshold = 0.0001
    n_iterations = 20 # must be an odd number

    sim_matrix[sim_matrix == 0] = -1e10
    #min_preferance = 0
    #min_preferance *= np.max(sim_matrix[sim_matrix > 0])
    min_preferance = np.min(sim_matrix[sim_matrix > 0]) - 10
    max_preferance = np.median(sim_matrix[sim_matrix > 0])
    print 'min_preferance, ', min_preferance
    print 'max_preferance, ', max_preferance
    if min_preferance > max_preferance:
        raise Exception('Something is wrong with preferance setting: %d %d' % (min_preferance, max_preferance))
    elif min_preferance == max_preferance:
        n_iterations = 1
        pref_list = [min_preferance]
    pref_step = (max_preferance-min_preferance) / n_iterations

    # cluster the data with affinity propagation ---------------------------------------------
    for iteration in range(n_iterations):
        if iteration == 0:
            preference = min_preferance
        else:
            preference += pref_step
        labels = []
        print '_______________________________________________________'
        print 'Aff. Prop. with preferance =', preference
        _, labels = affinity_propagation(sim_matrix, preference=preference)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        num_per_cluster = {}
        for i in range(n_clusters):
            num_per_cluster[i] = 0
        for label in labels:
            for i in range(n_clusters):
                if label == i:
                    num_per_cluster[i] += 1

        # TODO: criteria for skipping or breaking the loop ---------------------------------------------
        # skip the iteration if the number of clusters is as before
        if iteration == 0:
            old_n_clusters = n_clusters
        #elif n_clusters >= old_n_clusters:
        #    break
        old_n_clusters = n_clusters
        # increase the preference
        if n_clusters == 1:
            print 'DEBUG: Aff prop. n_clusters == 1, going to next iteration'
            min_preferance = preference
            max_preferance += (max_preferance - min_preferance) / 2
            pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
            print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
            continue
        # lower the preference
        if n_clusters >= 0.1*n:
            print 'DEBUG: Aff prop. n_clusters = %i, TOO HIGH!!!' %n_clusters
            max_preferance = preference
            min_preferance = preference - pref_step
            pref_step = (max_preferance-min_preferance) / (n_iterations-iteration)
            print 'min = %f, max = %f, step = %f' %(min_preferance, max_preferance, pref_step)
            continue
        # ---------------------------------------------------------------------------------------
        # display some information
        print 'Estimated number of clusters: ', n_clusters
        print 'Number of points per cluster: ', num_per_cluster
        #draw(A=sim_matrix, colors=labels)
        # ---------------------------------------------------------------------------------------
        sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
        #print 'DEBUG:'
        #print 'column_labels = ', column_labels
        #print 'sotred_labels = ', sotred_labels
        #save_matrix_fig(sorted_data, res_folder, file + '_B_dec' + str(iteration))
        # pull down the points which have a non-zero value that collides with points from other clusters
        sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)
        num_per_cluster = {}
        n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
        if -1 in sotred_labels2:
            all_clusters_list = range(-1, n_clusters)
        else:
            all_clusters_list = range(n_clusters)
        for i in all_clusters_list:
            num_per_cluster[i] = 0
        for label in sotred_labels2:
            for i in all_clusters_list:
                if label == i:
                    num_per_cluster[i] += 1
        non_clustered = 0
        for label in sotred_labels2:
            if label == -1:
                non_clustered += 1
        print 'Estimated number of clusters after removal: ', n_clusters
        print 'Number of points per cluster after removal: ', num_per_cluster
        print 'Number of non clustered points after removal:', non_clustered
        if 0 in num_per_cluster.values():
            print 'TIME TO DEBUG:'
            print 'sotred_labels2 = ', sotred_labels2
        # save picture of end matrix
        #save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' + str(iteration))
        #if res2_folder <> 'none':
        #    save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' + str(iteration))

        # find the best iteration, so we only save the best one --------------------------
        label_name_pairs = zip(sotred_labels2, sorted_names2)
        if non_clustered < min_non_clusterd:
            res_from_diff_params[iteration] = label_name_pairs
            nr_clusters_from_diff_params[iteration] = n_clusters
            non_clustered_from_diff_params[iteration] = non_clustered
            distribution_from_diff_params[iteration] = num_per_cluster
            min_non_clusterd = non_clustered
            if n_clusters > 1:
                second_best = iteration
            best_iteration = iteration
            print 'this is best iteration currently'

        # find the best iteration (according to variance of cluster sizes), ----------------
        # so we only save the best one
        temp_num_per_cluster = num_per_cluster.copy()
        if -1 in temp_num_per_cluster.keys():
            del temp_num_per_cluster[-1]
        if len(temp_num_per_cluster.values()) > 1:
            std_dev = np.std(temp_num_per_cluster.values())
            mean = np.mean(temp_num_per_cluster.values())
            rel_std_dev = std_dev / mean
            rel_std_dev *= pow(non_clustered/n, 2)
            print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
            std_dev = rel_std_dev
            # we accept the iteration if adjusted rel_std_dev is smaller, or
            # if it is within the threshold and the number of non-clustered points is smaller
            if (std_dev - max_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                sec_criteria_fulfiled = True
            else:
                sec_criteria_fulfiled = False
            if std_dev < max_std_dev or sec_criteria_fulfiled:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                max_std_dev = std_dev
                s_min_non_clusterd = non_clustered
                sec_best_iteration = iteration
                print 'this is second best iteration currently'
        # ----------------------------------------------------------------------------------
        print '_______________________________________________________'

    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1: data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1: data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_affProp_' + str(best_n_clusters) + '_' + str(best_non_clusterd)
        dec.write(path=res_folder, filename=best_dec, label_name_pairs=res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    if sec_best_iteration >= 0:
        if sec_best_iteration != best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_affPropSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd)
            dec.write(path=res_folder, filename=s_dec, label_name_pairs=res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
import optparse

from alignment import read_neg_polarity_items, get_full_alignment, print_alignment
from parse import read_in_fixed_parses, read, read_and_parse, write_out_parses

parser = optparse.OptionParser()
parser.add_option('-p', '--parses', dest="ud_file", help="file with UD parses")
parser.add_option('-a', '--amrs', dest="amr_file", help="file with AMRs")
parser.add_option('-o', '--output', dest="output_file", help="alignment file")
parser.add_option('-w', '--write_out', dest="write_out_ud", default="",
                  help="file to which write UD parses; if none, don't write out")
(opts, _) = parser.parse_args()

if opts.ud_file:
    sentences = read_in_fixed_parses(read(opts.amr_file), opts.ud_file)
else:
    sentences = read_and_parse(opts.amr_file)
if opts.write_out_ud:
    write_out_parses(sentences, opts.write_out_ud)

neg_dict = read_neg_polarity_items('neg-polarity.txt')
alignments = get_full_alignment(sentences, neg_dict)
print_alignment(alignments, opts.output_file)
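# Hedged usage sketch (not from the source): a typical invocation of the script
# above, using only the options it defines; the script name and file paths are
# placeholders.
#
#   python align.py --amrs data/amrs.txt --parses data/parses.conllu \
#       --output out/alignments.txt --write_out out/parses_out.conllu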
def dbscan(instance_path, res_folder, strategy=2):
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]
    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    if len(data) != len(row_names):
        print 'DBSCAN error: data and row_names have diff. lens', len(data), len(row_names)
    #save_matrix_fig(data, res_folder, file+'_in')
    dist_matrix = []
    try:
        dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).tocsr()
        print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
    except:
        print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
        dist_matrix = pp.strategy(data, 'distance', strategy)
        scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy), dist_matrix)

    # this part is important for BPP-like instances!!! ---------------------------------------
    '''if check_if_BPP_like(dist_matrix):
        bpp_like = all(True if x == 0 or x == 1 else False for x in np.nditer(dist_matrix))
    else:
        bpp_like = False'''
    bpp_like = False
    occupancy = len(dist_matrix.data) / (dist_matrix.shape[0] * dist_matrix.shape[1]) * 100
    q = 10
    dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
    print 'dist_percentile = ', dist_percentile
    if dist_percentile == 0: # or strategy == 6:
        q = 1
        print 'Recalculating dist_percentile..'
        #dist_percentile = np.percentile(a=dist_matrix, q=q)
        dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
        print 'dist_percentile = ', dist_percentile

    old_n_clusters = 0
    old_non_clustered = 0
    # dicts to save labels from all iterations, so we can later pick the best clustering
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    n = dist_matrix.shape[0]
    min_non_clusterd = n
    s_min_non_clusterd = n
    min_std_dev = n
    sec_threshold = 0.0001
    n_iterations = 49 # must be an odd number
    eps_list = get_eps_list(mid=dist_percentile, length=n_iterations, strategy=strategy)
    print 'eps_list = ', eps_list

    # cluster the data with DBSCAN ---------------------------------------------
    for iteration in range(n_iterations):
        gc.collect()
        if dist_percentile == 0 and not bpp_like:
            print 'dist_percentile = %i, -> we cannot use DBSCAN for clustering this instance.' %dist_percentile
            break
        # eps is in range: [dist_percentile - 0.5, dist_percentile + 0.5] but with geometric progression
        eps = eps_list[iteration]
        if eps <= 0 and not bpp_like:
            continue
        if eps >= 1 and not bpp_like:
            break
        # for distance strategy 1: 0.054...
        #eps = 0.1 + (iteration / 10)
        min_samples = 4
        #print 'DEBUG: eps = ', eps
        labels = []
        print '_______________________________________________________'
        print 'iteration= ', iteration
        print 'eps = ', eps
        print 'min_samples = ', min_samples
        if bpp_like:
            print 'DEBUG: Running getLabelsFrom01Dist()'
            labels = getLabelsFrom01Dist(dist_matrix)
        else:
            print 'Running DBSCAN...'
            try:
                db = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit(dist_matrix)
            except ValueError:
                print 'Value error occured in DBSCAN. Stop.'
                raise
            labels = db.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        num_per_cluster = {}
        for i in range(n_clusters):
            num_per_cluster[i] = 0
        for label in labels:
            for i in range(n_clusters):
                if label == i:
                    num_per_cluster[i] += 1
        non_clustered = 0
        for label in labels:
            if label == -1:
                non_clustered += 1

        # criteria for skipping or breaking the loop ---------------------------------------------
        # skip the iteration if the number of clusters is as before
        if iteration == 0:
            old_n_clusters = n_clusters
            old_non_clustered = non_clustered
        if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
            continue
        old_n_clusters = n_clusters
        old_non_clustered = non_clustered
        if n_clusters == 1 and non_clustered == 0:
            print 'Stopping because bigger EPS will be the same.'
            break
        # ---------------------------------------------------------------------------------------
        # display some information
        print 'Estimated number of clusters: ', n_clusters
        print 'Number of points per cluster: ', num_per_cluster
        print 'Number of non clustered points:', non_clustered
        #draw(A=sim_matrix, colors=labels)
        # ---------------------------------------------------------------------------------------
        sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
        #print 'DEBUG:'
        #print 'column_labels = ', column_labels
        #print 'sotred_labels = ', sotred_labels
        #save_matrix_fig(sorted_data, res_folder, file + '_B_dec' + str(iteration))
        # pull down the points which have a non-zero value that collides with points from other clusters
        sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)
        num_per_cluster = {}
        n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
        if -1 in sotred_labels2:
            all_clusters_list = range(-1, n_clusters)
        else:
            all_clusters_list = range(n_clusters)
        for i in all_clusters_list:
            num_per_cluster[i] = 0
        for label in sotred_labels2:
            for i in all_clusters_list:
                if label == i:
                    num_per_cluster[i] += 1
        non_clustered = 0
        for label in sotred_labels2:
            if label == -1:
                non_clustered += 1
        print 'Estimated number of clusters after removal: ', n_clusters
        print 'Number of points per cluster after removal: ', num_per_cluster
        print 'Number of non clustered points after removal:', non_clustered
        if 0 in num_per_cluster.values():
            print 'TIME TO DEBUG:'
            print 'sotred_labels2 = ', sotred_labels2
        # save picture of end matrix
        #save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' + str(iteration))
        #if res2_folder <> 'none':
        #    save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' + str(iteration))

        # find the best iteration, so we only save the best one --------------------------
        label_name_pairs = zip(sotred_labels2, sorted_names2)
        if non_clustered < min_non_clusterd:
            res_from_diff_params[iteration] = label_name_pairs
            nr_clusters_from_diff_params[iteration] = n_clusters
            non_clustered_from_diff_params[iteration] = non_clustered
            distribution_from_diff_params[iteration] = num_per_cluster
            min_non_clusterd = non_clustered
            if n_clusters > 1:
                second_best = iteration
            best_iteration = iteration
            print 'this is best iteration currently'
        if bpp_like:
            print 'This instance was BPP-like.'
            break

        # find the best iteration (according to variance of cluster sizes), ----------------
        # so we only save the best one
        temp_num_per_cluster = num_per_cluster.copy()
        if -1 in temp_num_per_cluster.keys():
            del temp_num_per_cluster[-1]
        if len(temp_num_per_cluster.values()) > 1:
            std_dev = np.std(temp_num_per_cluster.values())
            mean = np.mean(temp_num_per_cluster.values())
            rel_std_dev = std_dev / mean
            rel_std_dev *= pow(non_clustered/n, 2)
            print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
            std_dev = rel_std_dev
            # we accept the iteration if adjusted rel_std_dev is smaller, or
            # if it is within the threshold and the number of non-clustered points is smaller
            if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                sec_criteria_fulfiled = True
            else:
                sec_criteria_fulfiled = False
            if std_dev < min_std_dev or sec_criteria_fulfiled:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                min_std_dev = std_dev
                s_min_non_clusterd = non_clustered
                sec_best_iteration = iteration
                print 'this is second best iteration currently'
        # ----------------------------------------------------------------------------------
        print '_______________________________________________________'

    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1: data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1: data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_dbscan_' + str(best_n_clusters) + '_' + str(best_non_clusterd) + '_dist' + str(strategy)
        dec.write(path=res_folder, filename=best_dec, label_name_pairs=res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    if sec_best_iteration >= 0:
        if sec_best_iteration != best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_dbscanSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) + '_dist' + str(strategy)
            dec.write(path=res_folder, filename=s_dec, label_name_pairs=res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
def main():
    if len(sys.argv) == 2:
        leval_file(sys.argv[1])
    loop(lambda: lprint(leval(read(), lglobals)))
def prompt():
    p = "What do you do?"
    h = len(p)/2
    q = msg(y/2, x/2-h, 0.1, p, 'right')
    i = display.inp(q)
    read(i)
@builtin("if", 3, is_fexpr=True) def bi_if(args, scope): if eval(args[0], scope) == LNil: return eval(args[2], scope) else: return eval(args[1], scope) @builtin("mkfunc", 4, argtypes=[LStr, None, None, LCons]) def bi_mkfunc(args, scope): return LFunc(args[0].value, not args[1].is_nil(), args[2], args[3].clone()) @builtin("fn1", 2, is_fexpr=True, argtypes=[LSym, None]) def bi_mkfunc(args, scope): return LFunc( "<fn1 lambda>", False, LCons.from_py_list([ LSym("^let"), args[0], LCons.from_py_list([LSym("^lhs"), LSym("$args")]), args[1] ]), scope) with open("test.fl", 'r') as fd: code = read(fd.read()) print("Code:", code.l_str()) result = eval(code, builtins) print("Result:", result) print("Result.l_str():", result.l_str())
def mcl(instance_path, res_folder, strategy=2):
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]
    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    if len(data) != len(row_names):
        print 'MCL error: data and row_names have diff. lens', len(data), len(row_names)
        raise Exception
    #save_matrix_fig(data, res_folder, file+'_in')
    sim_matrix = []
    '''try:
        sim_matrix = np.load(res_folder+file+'_sim'+str(strategy)+'.npy')
        n = sim_matrix.shape[0]
        print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy)+'.npy')
    except:
        print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy)+'.npy')
        sim_matrix = pp.strategy(data, 'sim',strategy)
        np.save(res_folder+file+'_sim'+str(strategy)+'.npy', sim_matrix)'''
    try:
        sim_matrix = scipy.io.mmread(res_folder+file+'_sim'+str(strategy)).tocsr()
        print 'Sim matrix %s found.' %(res_folder+file+'_sim'+str(strategy))
    except:
        print 'Sim matrix %s NOT found!!!!' %(res_folder+file+'_sim'+str(strategy))
        sim_matrix = pp.strategy(data, 'sim', strategy)
        scipy.io.mmwrite(res_folder+file+'_sim'+str(strategy), sim_matrix)

    ############################################
    # good visualization
    #draw(sim_matrix, colors=[0 for x in range(sim_matrix.shape[0])])
    ############################################

    old_n_clusters = 0
    old_non_clustered = 0
    # dicts to save labels from all iterations, so we can later pick the best clustering
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    print sim_matrix.shape
    n = sim_matrix.shape[0]
    min_non_clusterd = n
    s_min_non_clusterd = n
    min_std_dev = n
    sec_threshold = 0.0001
    iteration = 0
    perfect_cl_found = False
    min_inf_factor = 1.1
    max_inf_factor = 2.2
    inf_l = max_inf_factor - min_inf_factor
    nr_cl_with_diff_params = []
    total_time = 0

    # cluster the data with MCL ---------------------------------------------
    for exp_iter in range(2, 3):
    #for exp_iter in range(2, 5):
        nr_cl_with_diff_inf = []
        #for save_id, inf_iter in enumerate(np.arange(1.2, 1.85, 0.05)):
        for save_id, inf_iter in enumerate([1.3]):
        #for inf_iter in range(6):
            gc.collect()
            labels = [-1 for x in range(n)]
            print '#############################################################'
            expand_factor = exp_iter
            #inflate_factor = 1.2 + inf_iter*0.4
            inflate_factor = inf_iter
            print 'DEBUG: iteration = ', iteration
            try:
                labels = np.load(res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy')
                print 'Existing clustering found for expand = %i, inflate = %f' %(expand_factor, inflate_factor)
                print res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy'
            except:
                print 'NO Existing clustering found for expand = %i, inflate = %f' %(expand_factor, inflate_factor)
                print 'Expand Factor = ', expand_factor
                print 'Infalte Factor = ', inflate_factor
                #print 'NO existing clustering found for expand = %i, inflate = %i.' %(expand_factor,inflate_factor)
                # inflation weakens relations between clusters and strengthens relations in clusters
                # the expansion operator is responsible for allowing flow to connect different regions of the graph;
                # the bigger the expand_factor, the fewer clusters (too big, and everything ends up in 1 cluster)
                start_time = time.time()
                clusters = mcl_implementation(sim_matrix, expand_factor=expand_factor, max_loop=20, inflate_factor=inflate_factor)
                curr_time = (time.time() - start_time)
                print 'curr_time: ', curr_time
                total_time += curr_time
                clust_map = {}
                for k, vals in clusters.items():
                    for v in vals:
                        clust_map[v] = k
                colors = []
                for i in range(n):
                    colors.append(clust_map.get(i, 100))
                set_of_colors = set(colors)
                print 'DEBUG: n clusters = ', len(set_of_colors)
                cur_label = 0
                labels = colors
                for cluster in set_of_colors:
                    labels = [cur_label if label == cluster else label for label in labels]
                    cur_label += 1
                labels = np.array(labels)
                np.save(res_folder+file+'_mcl_fix_'+str(save_id)+'_sim'+str(strategy)+'.npy', labels)

            # continue if all points are in 1 cluster
            if len(set(labels)) == 1:
                print 'skiping this iteration...'
                nr_cl_with_diff_inf += [1]
                continue
            # change expand factor if all points are in different clusters
            if len(set(labels)) >= 0.3*n:
                print 'moving to next expand factor...'
                break

            ############################################
            # good visualization
            '''if iteration == 3 and file <> 'pp08aCUTS':
                draw(sim_matrix, colors=[label+1 for label in sotred_labels2])
            if iteration == 1 and file == 'pp08aCUTS':
                draw(sim_matrix, colors=[label+1 for label in sotred_labels2])'''
            ############################################

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            nr_cl_with_diff_inf += [n_clusters]
            num_per_cluster = {}
            for i in range(n_clusters):
                num_per_cluster[i] = 0
            for label in labels:
                for i in range(n_clusters):
                    if label == i:
                        num_per_cluster[i] += 1
            non_clustered = 0
            for label in labels:
                if label == -1:
                    non_clustered += 1

            # criteria for skipping or breaking the loop ---------------------------------------------
            # skip the iteration if the number of clusters is as before
            if iteration == 0:
                old_n_clusters = n_clusters
                old_non_clustered = non_clustered
            if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
                continue
            old_n_clusters = n_clusters
            old_non_clustered = non_clustered
            # ---------------------------------------------------------------------------------------
            # display some information
            print 'Estimated number of clusters: ', n_clusters
            print 'Number of points per cluster: ', num_per_cluster
            #print 'REMOVE ME! '
            #continue
            # ---------------------------------------------------------------------------------------
            sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
            # pull down the points which have a non-zero value that collides with points from other clusters
            sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)
            num_per_cluster = {}
            n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
            if -1 in sotred_labels2:
                all_clusters_list = range(-1, n_clusters)
            else:
                all_clusters_list = range(n_clusters)
            for i in all_clusters_list:
                num_per_cluster[i] = 0
            for label in sotred_labels2:
                for i in all_clusters_list:
                    if label == i:
                        num_per_cluster[i] += 1
            non_clustered = 0
            for label in sotred_labels2:
                if label == -1:
                    non_clustered += 1
            print 'Number of points per cluster after removal: ', num_per_cluster
            print 'Number of non clustered points after removal:', non_clustered
            # change expand factor if all points are in different clusters
            if n_clusters >= 0.1*n:
                print 'moving to next expand factor...'
                break
            # save picture of end matrix
            #save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' + str(iteration))

            # find the best iteration (according to # of non clustered points), --------------
            # so we only save the best one
            label_name_pairs = zip(sotred_labels2, sorted_names2)
            if non_clustered < min_non_clusterd:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                min_non_clusterd = non_clustered
                best_iteration = iteration
                print 'this is best iteration currently'
                if non_clustered == 0:
                    print 'Perfect clustering found!'
                    perfect_cl_found = True
                    sec_best_iteration = iteration
                    break

            # find the best iteration (according to variance of cluster sizes), ----------------
            # so we only save the best one
            temp_num_per_cluster = num_per_cluster.copy()
            if -1 in temp_num_per_cluster.keys():
                del temp_num_per_cluster[-1]
            if len(temp_num_per_cluster.values()) > 1:
                std_dev = np.std(temp_num_per_cluster.values())
                mean = np.mean(temp_num_per_cluster.values())
                rel_std_dev = std_dev / mean
                rel_std_dev *= pow(non_clustered/n, 2)
                print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
                std_dev = rel_std_dev
                # we accept the iteration if adjusted rel_std_dev is smaller, or
                # if it is within the threshold and the number of non-clustered points is smaller
                if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                    sec_criteria_fulfiled = True
                else:
                    sec_criteria_fulfiled = False
                if std_dev < min_std_dev or sec_criteria_fulfiled:
                    res_from_diff_params[iteration] = label_name_pairs
                    nr_clusters_from_diff_params[iteration] = n_clusters
                    non_clustered_from_diff_params[iteration] = non_clustered
                    distribution_from_diff_params[iteration] = num_per_cluster
                    min_std_dev = std_dev
                    s_min_non_clusterd = non_clustered
                    sec_best_iteration = iteration
                    print 'this is second best iteration currently'
            # ----------------------------------------------------------------------------------
            print '#############################################################'
            iteration += 1
        nr_cl_with_diff_params += [nr_cl_with_diff_inf]
        if perfect_cl_found:
            break

    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1: data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1: data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_mcl_' + str(best_n_clusters) + '_' + str(best_non_clusterd) + '_sim' + str(strategy)
        dec.write(path=res_folder, filename=best_dec, label_name_pairs=res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    if sec_best_iteration >= 0:
        if sec_best_iteration != best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_mclSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) + '_sim' + str(strategy)
            dec.write(path=res_folder, filename=s_dec, label_name_pairs=res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print 'Total_time: ', total_time
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
def em(instance_path, res_folder, strategy=2):
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]
    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    if len(data) != len(row_names):
        print 'EM error: data and row_names have diff. lens', len(data), len(row_names)
    #save_matrix_fig(data, res_folder, file+'_in')

    old_n_clusters = 0
    old_non_clustered = 0
    # dicts to save labels from all iterations, so we can later pick the best clustering
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    n = len(data)
    min_non_clusterd = n
    s_min_non_clusterd = n
    min_std_dev = n
    sec_threshold = 0.0001
    perfect_cl_found = False
    iteration = 0
    failed = False

    # cluster the data with EM ---------------------------------------------
    for K in range(2, 4):
        for rand_iter in range(1):
            gc.collect()
            print '#############################################################'
            print 'File: ', file
            print 'DEBUG: iteration = ', iteration
            print 'DEBUG: K = %i, try = %i' %(K, rand_iter)
            try:
                labels = em_implementation(data, K=K)
            except ValueError:
                print 'FAILED'
                failed = True
                break
            # continue if all points are in 1 cluster or all points are in different clusters
            if len(set(labels)) == 1 or len(set(labels)) > 0.9 * n:
                #print 'DEBUG: labels: ', labels
                print 'skiping this iteration...'
                continue
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            num_per_cluster = {}
            for i in range(n_clusters):
                num_per_cluster[i] = 0
            for label in labels:
                for i in range(n_clusters):
                    if label == i:
                        num_per_cluster[i] += 1
            non_clustered = 0
            for label in labels:
                if label == -1:
                    non_clustered += 1

            # criteria for skipping or breaking the loop ---------------------------------------------
            # skip the iteration if the number of clusters is as before
            if iteration == 0:
                old_n_clusters = n_clusters
                old_non_clustered = non_clustered
            #if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
            #    continue
            old_n_clusters = n_clusters
            old_non_clustered = non_clustered
            # ---------------------------------------------------------------------------------------
            # display some information
            print 'Estimated number of clusters: ', n_clusters
            print 'Number of points per cluster: ', num_per_cluster
            # this is the case where K is too big and some clusters stay empty
            if np.sum(num_per_cluster.values()) != data.shape[0]:
                continue
            # ---------------------------------------------------------------------------------------
            sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
            # pull down the points which have a non-zero value that collides with points from other clusters
            sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)
            num_per_cluster = {}
            n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
            if -1 in sotred_labels2:
                all_clusters_list = range(-1, n_clusters)
            else:
                all_clusters_list = range(n_clusters)
            for i in all_clusters_list:
                num_per_cluster[i] = 0
            for label in sotred_labels2:
                for i in all_clusters_list:
                    if label == i:
                        num_per_cluster[i] += 1
            non_clustered = 0
            for label in sotred_labels2:
                if label == -1:
                    non_clustered += 1
            print 'Number of points per cluster after removal: ', num_per_cluster
            print 'Number of non clustered points after removal:', non_clustered
            if 0 in num_per_cluster.values():
                print 'TIME TO DEBUG:'
                print 'sotred_labels2 = ', sotred_labels2
            # save picture of end matrix
            #save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' + str(iteration))

            # find the best iteration (according to # of non clustered points), --------------
            # so we only save the best one
            label_name_pairs = zip(sotred_labels2, sorted_names2)
            if non_clustered < min_non_clusterd:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                min_non_clusterd = non_clustered
                best_iteration = iteration
                print 'this is best iteration currently'
                if non_clustered == 0:
                    print 'Perfect clustering found!'
                    perfect_cl_found = True
                    sec_best_iteration = iteration
                    break

            # find the best iteration (according to variance of cluster sizes), ----------------
            # so we only save the best one
            temp_num_per_cluster = num_per_cluster.copy()
            if -1 in temp_num_per_cluster.keys():
                del temp_num_per_cluster[-1]
            if len(temp_num_per_cluster.values()) > 1:
                std_dev = np.std(temp_num_per_cluster.values())
                mean = np.mean(temp_num_per_cluster.values())
                rel_std_dev = std_dev / mean
                rel_std_dev *= pow(non_clustered/n, 2)
                print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
                std_dev = rel_std_dev
                # we accept the iteration if adjusted rel_std_dev is smaller, or
                # if it is within the threshold and the number of non-clustered points is smaller
                if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                    sec_criteria_fulfiled = True
                else:
                    sec_criteria_fulfiled = False
                if std_dev < min_std_dev or sec_criteria_fulfiled:
                    res_from_diff_params[iteration] = label_name_pairs
                    nr_clusters_from_diff_params[iteration] = n_clusters
                    non_clustered_from_diff_params[iteration] = non_clustered
                    distribution_from_diff_params[iteration] = num_per_cluster
                    min_std_dev = std_dev
                    s_min_non_clusterd = non_clustered
                    sec_best_iteration = iteration
                    print 'this is second best iteration currently'
            # ----------------------------------------------------------------------------------
            print '#############################################################'
            iteration += 1
        if failed:
            break

    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1: data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1: data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_em_' + str(best_n_clusters) + '_' + str(best_non_clusterd) + '_str' + str(strategy)
        dec.write(path=res_folder, filename=best_dec, label_name_pairs=res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    if sec_best_iteration >= 0:
        if sec_best_iteration != best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_emSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) + '_str' + str(strategy)
            dec.write(path=res_folder, filename=s_dec, label_name_pairs=res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec