def main():
    '''
    Main function for PAM.

    Reads two command-line arguments (the data file path and the number of
    clusters ``k``), imports the data with ``importData``, runs ``kmedoids``
    on it and prints the elapsed time together with the three values
    ``kmedoids`` returns.

    Returns 1 when the number of command-line arguments is wrong; otherwise
    falls off the end and returns None.
    '''
    if len(sys.argv) != 3:
        print('Error: invalid number of parameters')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    if debugEnabled:
        print('filePath: ', filePath)
        print('k: ', k)

    # Run PAM on the imported data (e.g. europe.txt)
    data = importData(filePath)
    if debugEnabled:
        # Preview at most the first 10 records.  Clamping to len(data) keeps
        # a small data set from raising IndexError just because debugging
        # output is switched on.
        for i in range(min(len(data), 10)):
            print('data=', data[i])

    # Time only the clustering call itself
    startTime = time.time()
    best_cost, best_choice, best_medoids = kmedoids(data, k)
    endTime = time.time()

    print('best_time: ', endTime - startTime)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
def main():
    '''
    Command-line entry point for PAM.

    Usage: <script> data_file k

    Loads the data file, clusters it with ``kmedoids`` and reports the
    timing and the returned cost / choice / medoids on stdout.  Returns 1
    if the argument count is wrong, otherwise None.
    '''
    argv = sys.argv
    if len(argv) != 3:
        print('Error: invalid number of parameters')
        return 1

    # Positional parameters: data file first, then the cluster count
    filePath = argv[1]
    k = int(argv[2])
    if debugEnabled == True:
        print('filePath: ', filePath)
        print('k: ', k)

    # Load the data set (e.g. europe.txt)
    data = importData(filePath)
    if debugEnabled == True:
        # Show no more than the first ten records
        preview_count = min(len(data), 10)
        for idx in range(preview_count):
            print('data=', data[idx])

    # Measure the wall-clock time of the clustering call
    t_start = time.time()
    best_cost, best_choice, best_medoids = kmedoids(data, k)
    t_end = time.time()

    print('best_time: ', t_end - t_start)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
def main():
    '''
    Main function for Parallel PAM.

    Reads three command-line arguments: the data file path, the number of
    clusters ``k`` and an integer ``t`` that is passed straight through to
    ``kmedoids_parallel`` (presumably the number of workers -- confirm
    against ``kmedoids_parallel``).  Prints the elapsed time and the three
    values ``kmedoids_parallel`` returns.

    Returns 1 when the number of command-line arguments is wrong; otherwise
    falls off the end and returns None.
    '''
    if len(sys.argv) != 4:
        print('Error: invalid number of parameters')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    t = int(sys.argv[3])
    if debugEnabled:
        print('filePath: ', filePath)
        print('k: ', k)
        print('t: ', t)

    # Run PAM on the imported data (e.g. europe.txt)
    data = util.importData(filePath)
    if debugEnabled:
        # Preview at most the first 10 records.  Clamping to len(data) keeps
        # a small data set from raising IndexError in debug mode.
        for i in range(min(len(data), 10)):
            print('data=', data[i])

    # Time only the clustering call itself
    startTime = time.time()
    best_cost, best_choice, best_medoids = kmedoids_parallel(data, k, t)
    endTime = time.time()

    print('best_time: ', endTime - startTime)
    print('best_cost: ', best_cost)
    print('best_choice: ', best_choice)
    print('best_medoids: ', best_medoids)
def main():
    '''
    Main function for Clara.

    Command line (positions as read from sys.argv):
        [1] path_to_node_names
        [2] k
        [3] cost_type (COST)
        COST == 3: [4] path_to_distance_matrix, [5] path_to_edge_matrix
        COST == 4: [4] path_to_distance_matrix, [5] path_to_similarity_matrix,
                   [6] path_to_edge_matrix, [7] optional acceleration degree
        [8] optional splittedSim flag (any value present counts as set)

    Runs ``clara`` and dumps
    [best_cost, best_choice, best_medoids, mod, harmonic_centrality, isolates]
    as JSON to a file named "clara_json_version_<digits>_one", then prints a
    summary.

    Returns 1 on a usage error, otherwise None.
    '''
    argc = len(sys.argv)
    # NOTE: prints use a single pre-formatted string so the output is the same
    # as the former Python 2 print statements under either interpreter.
    print("sys.argv:  %s" % (sys.argv,))
    print("len(sys.argv):  %s" % (argc,))
    if argc < 4:
        print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names k cost_type [path_to_distance_matrix [path_to_similarity_matrix] path_to_edge_matrix_(affinity)]')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])

    # Matrix-based costs need their matrix paths.  The old guard compared
    # sys.argv[3] (a string) with the integer 3 and therefore never fired;
    # missing paths then surfaced later as NameError / IndexError.
    required_argc = {3: 6, 4: 7}.get(COST, 4)
    if argc < required_argc:
        print('Error: cost based on distance/similarity matrix without the matrix specified')
        return 1

    try:
        splittedSim = sys.argv[8]
    except IndexError:
        splittedSim = False
        print("no value for splittedSim parameter specified, so it is set to False")

    # Default used whenever no (valid) acceleration degree is supplied; it
    # was previously left unbound for COST == 3.
    acceleration = 0
    if argc >= 6:
        if COST == 3:
            distDictPath = sys.argv[4]
            affinityPath = sys.argv[5]
            print("distDictPath:  %s" % (distDictPath,))
            print("affinityPath:  %s" % (affinityPath,))
        elif COST == 4:
            distDictPath = sys.argv[4]
            simDictPath = sys.argv[5]
            affinityPath = sys.argv[6]
            try:
                acceleration = int(sys.argv[7])
            except (IndexError, ValueError):
                acceleration = 0
            else:
                print("acceleration degree:  %s" % (acceleration,))
            print("distDictPath:  %s" % (distDictPath,))
            print("simDictPath:  %s" % (simDictPath,))
            print("affinityPath:  %s" % (affinityPath,))
        else:
            print("Error: I dunno if you pass 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)")
            return 1

    if debugEnabled:
        print('filePath:  %s' % (filePath,))
        print('k:  %s' % (k,))
        print("Cost Function:  %s" % (COST,))

    # Import the inputs
    matrix_based = COST in (3, 4)
    simDictOur = {}
    if matrix_based:
        affinitiesOur = importData(affinityPath, ifjson=1)
        print("\n affinities imported")
        data = importData(filePath, ifjson=1)
        distDictOur = importData(distDictPath, ifjson=1)
        print("\n pairwise distances imported")
        if COST == 4:
            if not splittedSim:
                # The similarity file holds one JSON object per line; merge
                # them all into a single dictionary.
                with open(simDictPath, 'r') as S:
                    for line in S:
                        simDictOur.update(json.loads(line))
            else:
                simDictOur = load_splitted_sim(simDictPath, coresLoaded)
            print("\n pairwise similarities imported\n")
    else:
        data = importData(filePath)

    if debugEnabled:
        # Preview at most 10 records; never index past the end of the data.
        for i in range(min(len(data), 10)):
            print('example_data=', data[i])

    # Run Clara (timed)
    clara_kwargs = {'saveAllResults': False, 'take_all_nodes': takeAllNodes}
    if matrix_based:
        # simDictOur stays {} for COST == 3, as in the original call.
        clara_kwargs.update(
            distDictClara=distDictOur,
            simDictClara=simDictOur,
            affinities=affinitiesOur,
            acceleration=acceleration,
        )
    startTime = time.time()
    best_cost, best_choice, best_medoids, cost_list, isolates = clara(
        data, k, COST, **clara_kwargs)
    endTime = time.time()

    mod = None
    harmonic_centrality = None
    # --- disabled on purpose; don't delete ---------------------------------
    # # Compute modularity and display it
    # startMod = time.time()
    # if COST in [3,4]:
    #     mod = modularity(data, COST=COST, distDict=distDictOur, edgeDict=affinitiesOur, medoids=best_medoids)
    # else:
    #     print "no modularity for this regime, sorry"
    #     pass
    # endMod = time.time()
    # # comment up to this
    # startCentr = time.time()
    # harmonic_centrality = intra_cluster_centrality(data, COST=COST, distDict=distDictOur, medoids=best_medoids)
    # endCentr = time.time()
    # ------------------------------------------------------------------------

    # Save the result
    fordump = [best_cost, best_choice, best_medoids, mod, harmonic_centrality, isolates]
    # `//` keeps the integer division the file name relied on under Python 2.
    json_filename = "clara_json_version_" + str(int(time.time()) // 10)[3:] + "_one"  # time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    with open(json_filename, "w") as dump_file:
        json.dump(fordump, dump_file)

    # Print the result
    diff = endTime - startTime
    best_cluster_length = [len(best_medoids[i]) for i in best_choice]
    print('\n\n')
    print('best_time:  %s' % (diff,))
    #print 'best_modularity_time: ', endMod - startMod
    #print 'best_centrality_time: ', endCentr - startCentr
    #print 'best_modularity: ', mod
    print('best_cost:  %s' % (best_cost,))
    print('best_choice:  %s' % (best_choice,))
    print('best_cluster_lengths:  %s' % (best_cluster_length,))
    print('clustered_nodes:  %s' % (sum(best_cluster_length),))
    #print 'all_nodes: ', len(data)
    print('isolates:  %s %s' % (len(isolates), isolates))
    print('\n\n')
    print('best_medoids:  %s' % (best_medoids,))
    print('\n\n')
def main():
    '''
    Main function for Clara.

    Command line (positions as read from sys.argv):
        [1] path_to_node_names
        [2] k
        [3] cost_type (COST)
        COST == 3: [4] path_to_distance_matrix, [5] path_to_edge_matrix
        COST == 4: [4] path_to_distance_matrix, [5] path_to_similarity_matrix,
                   [6] path_to_edge_matrix, [7] optional acceleration degree
        [8] optional splittedSim flag (any value present counts as set)

    Runs ``clara`` and dumps
    [best_cost, best_choice, best_medoids, mod, harmonic_centrality, isolates]
    as JSON to a file named "clara_json_version_<digits>_one", then prints a
    summary.

    Returns 1 on a usage error, otherwise None.
    '''
    argc = len(sys.argv)
    # NOTE: prints use a single pre-formatted string so the output is the same
    # as the former Python 2 print statements under either interpreter.
    print("sys.argv:  %s" % (sys.argv,))
    print("len(sys.argv):  %s" % (argc,))
    if argc < 4:
        print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names k cost_type [path_to_distance_matrix [path_to_similarity_matrix] path_to_edge_matrix_(affinity)]')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])

    # Matrix-based costs need their matrix paths.  The old guard compared
    # sys.argv[3] (a string) with the integer 3 and therefore never fired;
    # missing paths then surfaced later as NameError / IndexError.
    required_argc = {3: 6, 4: 7}.get(COST, 4)
    if argc < required_argc:
        print('Error: cost based on distance/similarity matrix without the matrix specified')
        return 1

    try:
        splittedSim = sys.argv[8]
    except IndexError:
        splittedSim = False
        print("no value for splittedSim parameter specified, so it is set to False")

    # Default used whenever no (valid) acceleration degree is supplied; it
    # was previously left unbound for COST == 3.
    acceleration = 0
    if argc >= 6:
        if COST == 3:
            distDictPath = sys.argv[4]
            affinityPath = sys.argv[5]
            print("distDictPath:  %s" % (distDictPath,))
            print("affinityPath:  %s" % (affinityPath,))
        elif COST == 4:
            distDictPath = sys.argv[4]
            simDictPath = sys.argv[5]
            affinityPath = sys.argv[6]
            try:
                acceleration = int(sys.argv[7])
            except (IndexError, ValueError):
                acceleration = 0
            else:
                print("acceleration degree:  %s" % (acceleration,))
            print("distDictPath:  %s" % (distDictPath,))
            print("simDictPath:  %s" % (simDictPath,))
            print("affinityPath:  %s" % (affinityPath,))
        else:
            print("Error: I dunno if you pass 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)")
            return 1

    if debugEnabled:
        print('filePath:  %s' % (filePath,))
        print('k:  %s' % (k,))
        print("Cost Function:  %s" % (COST,))

    # Import the inputs
    matrix_based = COST in (3, 4)
    simDictOur = {}
    if matrix_based:
        affinitiesOur = importData(affinityPath, ifjson=1)
        print("\n affinities imported")
        data = importData(filePath, ifjson=1)
        distDictOur = importData(distDictPath, ifjson=1)
        print("\n pairwise distances imported")
        if COST == 4:
            if not splittedSim:
                # The similarity file holds one JSON object per line; merge
                # them all into a single dictionary.
                with open(simDictPath, 'r') as S:
                    for line in S:
                        simDictOur.update(json.loads(line))
            else:
                simDictOur = load_splitted_sim(simDictPath, coresLoaded)
            print("\n pairwise similarities imported\n")
    else:
        data = importData(filePath)

    if debugEnabled:
        # Preview at most 10 records; never index past the end of the data.
        for i in range(min(len(data), 10)):
            print('example_data=', data[i])

    # Run Clara (timed)
    clara_kwargs = {'saveAllResults': False, 'take_all_nodes': takeAllNodes}
    if matrix_based:
        # simDictOur stays {} for COST == 3, as in the original call.
        clara_kwargs.update(
            distDictClara=distDictOur,
            simDictClara=simDictOur,
            affinities=affinitiesOur,
            acceleration=acceleration,
        )
    startTime = time.time()
    best_cost, best_choice, best_medoids, cost_list, isolates = clara(
        data, k, COST, **clara_kwargs)
    endTime = time.time()

    mod = None
    harmonic_centrality = None
    # --- disabled on purpose; don't delete ---------------------------------
    # # Compute modularity and display it
    # startMod = time.time()
    # if COST in [3,4]:
    #     mod = modularity(data, COST=COST, distDict=distDictOur, edgeDict=affinitiesOur, medoids=best_medoids)
    # else:
    #     print "no modularity for this regime, sorry"
    #     pass
    # endMod = time.time()
    # # comment up to this
    # startCentr = time.time()
    # harmonic_centrality = intra_cluster_centrality(data, COST=COST, distDict=distDictOur, medoids=best_medoids)
    # endCentr = time.time()
    # ------------------------------------------------------------------------

    # Save the result
    fordump = [
        best_cost, best_choice, best_medoids, mod, harmonic_centrality,
        isolates
    ]
    # `//` keeps the integer division the file name relied on under Python 2.
    json_filename = "clara_json_version_" + str(
        int(time.time()) // 10)[3:] + "_one"  # time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    with open(json_filename, "w") as dump_file:
        json.dump(fordump, dump_file)

    # Print the result
    diff = endTime - startTime
    best_cluster_length = [len(best_medoids[i]) for i in best_choice]
    print('\n\n')
    print('best_time:  %s' % (diff,))
    #print 'best_modularity_time: ', endMod - startMod
    #print 'best_centrality_time: ', endCentr - startCentr
    #print 'best_modularity: ', mod
    print('best_cost:  %s' % (best_cost,))
    print('best_choice:  %s' % (best_choice,))
    print('best_cluster_lengths:  %s' % (best_cluster_length,))
    print('clustered_nodes:  %s' % (sum(best_cluster_length),))
    #print 'all_nodes: ', len(data)
    print('isolates:  %s %s' % (len(isolates), isolates))
    print('\n\n')
    print('best_medoids:  %s' % (best_medoids,))
    print('\n\n')
def main():
    '''
    Main function for PAM.

    Command line:
        path_to_node_names k cost_type [pairwise_matrix_(distance_or_similarity)]

    cost_type 3 uses a distance matrix and cost_type 4 a similarity matrix;
    both require the fourth argument.  Any other cost_type runs ``kmedoids``
    on the plain imported data.

    Dumps [best_cost, best_choice, best_medoids] as JSON into a file named
    "pam_json_version <digits>" and prints a summary.

    Returns 1 on a usage error, otherwise None.
    '''
    argc = len(sys.argv)
    print(sys.argv)
    print(argc)
    if argc not in (4, 5):
        print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names k cost_type [pairwise_matrix_(distance_or_similarity)]')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])  # here it's obligatory, but in kmedoids(), optional. FIX IT

    # The old guard compared sys.argv[3] (a string) with the integer 3, so it
    # never fired and a missing matrix path surfaced later as a NameError.
    # The same message is used for both matrix-based cost types.
    if argc == 4 and COST in (3, 4):
        print('Error: cost based on distance matrix without distance matrix specified')
        return 1

    if argc == 5:
        if COST == 4:
            # k-medoids based on similarity matrix (e.g Jaccard score)
            simDictPath = sys.argv[4]
            print(simDictPath)
        elif COST == 3:
            # k-medoids based on distance matrix (e.g. Average shortest path)
            distDictPath = sys.argv[4]
            print(distDictPath)  # distDictPath is not a file - it's a path to file (string)
        else:
            print("Error: I dunno whether you pass affinities to compute 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)")
            return 1

    if debugEnabled:
        print('filePath: ', filePath)
        print('k: ', k)
        print('cost function number: ', COST)  # better yet, display the name, not number FIX IT

    # Import the inputs
    distDictOur = {}
    simDictOur = {}
    if COST == 3:
        data = importData(filePath, ifjson=1)  # actually, ifjson=1 is not tantamount to direct distance method FIX IT
        distDictOur = importData(distDictPath, ifjson=1)
    elif COST == 4:
        data = importData(filePath, ifjson=1)
        simDictOur = importData(simDictPath, ifjson=1)
        print("pairwise similarities imported")
    else:
        data = importData(filePath)

    if debugEnabled:
        # Preview at most 10 records; never index past the end of the data.
        for i in range(min(len(data), 10)):
            print('data=', data[i])

    # Time only the clustering call itself
    startTime = time.time()
    if COST == 3:
        result = kmedoids(data, k, COST, distDictOur, simDictKM={})
    elif COST == 4:
        result = kmedoids(data, k, COST, distDictKM={}, simDictKM=simDictOur)
    else:
        result = kmedoids(data, k, COST)
    endTime = time.time()
    best_cost, best_choice, best_medoids = result

    # Saving the result into new file
    fordump = [best_cost, best_choice, best_medoids]
    # `//` keeps the integer division the file name relied on under Python 2.
    json_filename = "pam_json_version " + str(int(time.time()) // 10)[3:]  # find normal time format FIX IT
    with open(json_filename, "w") as dump_file:
        json.dump(fordump, dump_file)

    best_cluster_length = []
    for i in best_choice:
        try:
            best_cluster_length.append(len(best_medoids[i]))
        except KeyError:
            # A chosen medoid without a cluster entry is reported, not fatal.
            best_cluster_length.append("KeyError ;)")

    # Single pre-formatted strings reproduce the former Python 2 print
    # statements exactly (no space is emitted after an embedded newline).
    print('best_time:  %s' % (endTime - startTime,))
    print('best_cost:  %s' % (best_cost,))
    print('best_choice: \n%s' % (best_choice,))
    print('best_cluster_lengths: \n%s' % (best_cluster_length,))
    print('best_cluster_contents: \n%s' % (best_medoids,))
def main():
    '''
    Entry point for the "kopt" and "cores" methods.

    Parses the command line with ``read_arguments`` and imports the inputs
    it names, then, depending on ``args.method``:

    * 'kopt'  -- runs ``ModularityProfile`` over a list of K values and dumps
      [Kopt, mod_lists, modStdev, modMean, modMax] as JSON into
      "kopt_json_version_" + Version.
    * 'cores' -- runs ``SGJRIcores`` and pickles its result into
      "cores_pickled_version_" + Version.

    Returns None.
    '''
    # parse arguments
    args = read_arguments()

    data = []
    affinities = {}
    simDict = {}
    if args.distDict is not None:
        distDict = importData(args.distDict, ifjson=1)
        print("\n pairwise distances imported")
    else:
        distDict = {}

    # The raw inputs are only imported when no pre-computed cores are given.
    if not args.loadedCores:
        data = importData(args.data, ifjson=1)
        affinities = importData(args.edgeDict, ifjson=1)
        print("\n affinities imported")
        simDict = importData(args.simDict, ifjson=1)
        print("\n pairwise similarities imported\n")

    #~~~~~~~~~~~~ method for finding the optimal K ~~~~~~~~~~~#
    if args.method == 'kopt':
        trials_decay = bool(args.trials_decay)

        # Build the K values to try: 1..KlastSeq followed by either the
        # explicit Kupper values or just Kmax; without KlastSeq, 1..Kmax.
        # NOTE(review): Kmax is appended only when no Kupper values are
        # given -- confirm against the original indentation.
        if args.KlastSeq:
            Klist = list(range(1, args.KlastSeq + 1))
            if args.Kupper:
                Klist = Klist + list(args.Kupper)
            else:
                Klist.append(args.Kmax)
        else:
            Klist = list(range(1, args.Kmax + 1))

        startTime = time.time()
        Kopt, mod_lists, modStdev, modMean, modMax = ModularityProfile(
            data,
            Kmin=args.Kmin,
            Kmax=args.Kmax,
            Klist=Klist,
            edgeDict=affinities,
            simDict=simDict,
            bagSize=args.bagSize,
            trials=args.trials,
            trials_decay=trials_decay)
        endTime = time.time()

        fordump = [Kopt, mod_lists, modStdev, modMean, modMax]
        json_filename = "kopt_json_version_" + Version
        with open(json_filename, "w") as dump_file:
            json.dump(fordump, dump_file)

        # Single pre-formatted strings reproduce the former Python 2 print
        # statements exactly.
        print('\n\n')
        print('saved as:  %s' % (json_filename,))
        print('time:  %s' % (endTime - startTime,))
        print('optimal K:  %s' % (Kopt,))
        print('modMax:  %s' % (modMax,))
        for i, k_value in enumerate(Klist):
            print("mod_list for k=%d is: %s" % (k_value, mod_lists[i]))
        print('\n\n')

    #~~~~~~~~~~~ method for averaging clustering results ~~~~~~~~~~~#
    if args.method == "cores":
        print("gotcha")
        result = SGJRIcores(
            data=data,
            K=args.K,
            edgeDict=affinities,
            simDict=simDict,
            distDict=distDict,
            bagSize=args.bagSize,
            trials=args.trials,
            threshold=args.threshold,
            loadedCores=args.loadedCores,
            filepathCores=args.filepathCores,
            dendroFormat=args.dendroFormat,
            acceleration=int(args.acceleration))

        # The original guard here (`args.distDict != None`) had been replaced
        # by `0 == 0`, so the six-element result layout is always used; the
        # unreachable four-element alternative was dropped.
        clustData, commonCluster, treeStruct, segmentDict, harmCentr = result[:-1]
        mongo = result[-1]
        if args.saveCommonClusters:
            saved_common = commonCluster
        else:
            saved_common = ['commonCluster_placeholder']
        fordump = [
            clustData, saved_common, treeStruct, segmentDict, harmCentr, mongo
        ]

        pkl_filename = "cores_pickled_version_" + Version
        # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
        # binary mode ('w' corrupts it on Windows and fails on Python 3).
        with open(pkl_filename, 'wb') as f:
            pickle.dump(fordump, f, pickle.HIGHEST_PROTOCOL)
        print("Saved as:  %s" % (pkl_filename,))
def main():
    '''
    Main function for PAM.

    Command line:
        path_to_node_names k cost_type [pairwise_matrix_(distance_or_similarity)]

    cost_type 3 uses a distance matrix and cost_type 4 a similarity matrix;
    both require the fourth argument.  Any other cost_type runs ``kmedoids``
    on the plain imported data.

    Dumps [best_cost, best_choice, best_medoids] as JSON into a file named
    "pam_json_version <digits>" and prints a summary.

    Returns 1 on a usage error, otherwise None.
    '''
    argc = len(sys.argv)
    print(sys.argv)
    print(argc)
    if argc not in (4, 5):
        print('Error: invalid number of parameters. Your parameters should be: \n path_to_node_names k cost_type [pairwise_matrix_(distance_or_similarity)]')
        return 1

    # Get the parameters
    filePath = sys.argv[1]
    k = int(sys.argv[2])
    COST = int(sys.argv[3])  # here it's obligatory, but in kmedoids(), optional. FIX IT

    # The old guard compared sys.argv[3] (a string) with the integer 3, so it
    # never fired and a missing matrix path surfaced later as a NameError.
    # The same message is used for both matrix-based cost types.
    if argc == 4 and COST in (3, 4):
        print('Error: cost based on distance matrix without distance matrix specified')
        return 1

    if argc == 5:
        if COST == 4:
            # k-medoids based on similarity matrix (e.g Jaccard score)
            simDictPath = sys.argv[4]
            print(simDictPath)
        elif COST == 3:
            # k-medoids based on distance matrix (e.g. Average shortest path)
            distDictPath = sys.argv[4]
            print(distDictPath)  # distDictPath is not a file - it's a path to file (string)
        else:
            print("Error: I dunno whether you pass affinities to compute 'similarity' (COST=4) or you pass 'distance matrix' (COST=3)")
            return 1

    if debugEnabled:
        print('filePath: ', filePath)
        print('k: ', k)
        print('cost function number: ', COST)  # better yet, display the name, not number FIX IT

    # Import the inputs
    distDictOur = {}
    simDictOur = {}
    if COST == 3:
        data = importData(filePath, ifjson=1)  # actually, ifjson=1 is not tantamount to direct distance method FIX IT
        distDictOur = importData(distDictPath, ifjson=1)
    elif COST == 4:
        data = importData(filePath, ifjson=1)
        simDictOur = importData(simDictPath, ifjson=1)
        print("pairwise similarities imported")
    else:
        data = importData(filePath)

    if debugEnabled:
        # Preview at most 10 records; never index past the end of the data.
        for i in range(min(len(data), 10)):
            print('data=', data[i])

    # Time only the clustering call itself
    startTime = time.time()
    if COST == 3:
        result = kmedoids(data, k, COST, distDictOur, simDictKM={})
    elif COST == 4:
        result = kmedoids(data, k, COST, distDictKM={}, simDictKM=simDictOur)
    else:
        result = kmedoids(data, k, COST)
    endTime = time.time()
    best_cost, best_choice, best_medoids = result

    # Saving the result into new file
    fordump = [best_cost, best_choice, best_medoids]
    # `//` keeps the integer division the file name relied on under Python 2.
    json_filename = "pam_json_version " + str(int(time.time()) // 10)[3:]  # find normal time format FIX IT
    with open(json_filename, "w") as dump_file:
        json.dump(fordump, dump_file)

    best_cluster_length = []
    for i in best_choice:
        try:
            best_cluster_length.append(len(best_medoids[i]))
        except KeyError:
            # A chosen medoid without a cluster entry is reported, not fatal.
            best_cluster_length.append("KeyError ;)")

    # Single pre-formatted strings reproduce the former Python 2 print
    # statements exactly (no space is emitted after an embedded newline).
    print('best_time:  %s' % (endTime - startTime,))
    print('best_cost:  %s' % (best_cost,))
    print('best_choice: \n%s' % (best_choice,))
    print('best_cluster_lengths: \n%s' % (best_cluster_length,))
    print('best_cluster_contents: \n%s' % (best_medoids,))
from util import importData
from util import CLASS_MAP
import pandas as pd

# Datasets for machine learning tasks
#---------------------------------------------------------------------------------------------------------------------

# Load the raw wine reviews through the project's importData helper with
# censoring, filtering and description processing switched on.
df = importData(
    "data-raw/winemag-data_first150k.csv",
    censor=True,
    filter=True,
    processDescriptions=True,
)

# Only these four columns are carried into the split data sets.
df_keep = df[['index', 'color', 'class', 'description']]

# Fixed-size test and dev sets; every remaining row goes to training.
num_test = 20000
num_dev = 20000
num_train = len(df_keep) - num_test - num_dev
assert num_train > 0

# Shuffle all rows once with a fixed seed, then cut the shuffled frame into
# consecutive test / dev / train slices.
data = df_keep.sample(
    frac=1,
    replace=False,
    random_state=1415926,
).reset_index(drop=True)
data_test = data.iloc[:num_test]
data_dev = data.iloc[num_test:num_test + num_dev]
data_train = data.iloc[num_test + num_dev:num_test + num_dev + num_train]

# Save to disk
data_test.to_csv('data-processed/data.test', index=False)