def cluster_data(data, ov_min=10, ov_min1=0.2):
    cl = Clustering(ov_min, ov_min1)
    cl.fill_clusters(data['x1'], data['x2'])
    c_labels = cl.clabels
    nhits = len(c_labels)
    n_clust = cl.ncl
    x1cl = cl.cluster_x1
    x2cl = cl.cluster_x2
    ycl = []
    pcentcl = []
    namecl = []
    detailcl = []
    x1tcl = []
    x2tcl = []
    dxtcl = []
    for i in range(n_clust):
        ycl.append(float(i + 1) / 2.)
        # take the attributes of the first hit that belongs to cluster i
        for j in range(nhits):
            if c_labels[j] == i:
                pcentcl.append(data['pcent'][j])
                namecl.append(data['name'][j])
                detailcl.append(data['detail'][j])
                x1tcl.append(data['x1t'][j])
                x2tcl.append(data['x2t'][j])
                dxtcl.append(data['dxt'][j])
                break
    new_data = dict(x1=x1cl,
                    x2=x2cl,
                    dx=[end - beg for beg, end in zip(x1cl, x2cl)],
                    xm=[(beg + end) / 2 for beg, end in zip(x1cl, x2cl)],
                    x1t=x1tcl,
                    x2t=x2tcl,
                    dxt=dxtcl,
                    y=ycl,
                    nhits=cl.nhits,
                    name=namecl,
                    pcent=pcentcl,
                    detail=detailcl)
    return new_data, n_clust
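# A minimal usage sketch for cluster_data, assuming Clustering is importable
# from the same module. The 'hits' dictionary below is hypothetical sample
# data: two overlapping intervals and one separate interval.
hits = dict(x1=[0.0, 5.0, 100.0], x2=[20.0, 25.0, 120.0],
            pcent=[90, 85, 70], name=['a', 'b', 'c'], detail=['', '', ''],
            x1t=[0, 5, 100], x2t=[20, 25, 120], dxt=[20, 20, 20])
clustered, n_clust = cluster_data(hits)
print(n_clust, clustered['nhits'])  # e.g. 2 clusters if the first two merge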
def generate(self, keys, url):
    json_work("other_files/work_file.json", "w", [])  # reset the work file
    print(f'Keys received: {len(keys)}')
    if len(keys) > 0:
        # generate pre-templates from keys with unique stemming
        self.generate_pretmp(keys)
        print(f'Keys after removing duplicates: {len(self.work_file)}')
        time.sleep(2)
        if len(self.work_file) > 0:
            with ThreadPoolExecutor(5) as executor:
                for _ in executor.map(self.template_generated, self.work_file):
                    pass
            work = json_work("other_files/work_file.json", "r")
            if len(work) > 0:
                gen_data = sorted(work, key=lambda x: x["frequency"]["basic"],
                                  reverse=True)
                json_work("other_files/work_file.json", "w", gen_data)
                gen_data += json_work("other_files/main.json", "r")
                gen_data = sorted(gen_data, key=lambda x: x["frequency"]["basic"],
                                  reverse=True)
                json_work("other_files/main.json", "w", gen_data)
                print(f"url {url} processed")
                clustering = Clustering(
                    json_work("other_files/work_file.json", "r"), url)
                clustering.run()
            else:
                print("Moving on to the next url")
    return
def get_decomp(self, method='MDS', **kwargs):
    optioncheck(method, ['MDS', 'spectral'])
    cl = Clustering(self.dm)
    if method == 'MDS':
        return cl.MDS_decomp()
    if method == 'spectral':
        return cl.spectral_decomp(**kwargs)
def __init__(self):
    self.nwalkers = 32
    self.ndim = 7
    filename = "tutorial.h5"
    backend = emcee.backends.HDFBackend(filename)
    backend.reset(self.nwalkers, self.ndim)
    hod_params = {"M_min": 0,
                  "galaxy_density": 0.00057,
                  "boxsize": 1000,
                  "log_halo_mass_bins": np.arange(10, 15, 0.1),
                  "halo_histo": np.loadtxt("../data/halo_central_histo.dat")}
    halofile = "../../ELG_HOD_optimization/data/halo_M200b_0.54980_for_mock.dat"
    self.mockfactory = MockFactory(halofile, boxsize=1000, cvir_fac=1,
                                   hod_parameters=hod_params)
    # clustering calculator
    rbins = np.logspace(np.log10(0.1), np.log10(70), 21)
    self.cluster = Clustering(rbins)
    # read xi and wp from data, read cov matrix
    self.clustering_data = np.loadtxt("../data/clustering_data.dat")
    self.scaled_cov = np.loadtxt("../data/scaled_cov.dat")
    # with Pool(10) as pool:
    #     self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim,
    #                                          self.log_prob, backend=backend,
    #                                          pool=pool)
    #     self.run()
    self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim,
                                         self.log_prob, backend=backend)
    self.run()
def perform_clustering(
        term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their embeddings.
    Return:
        A dictionary mapping each cluster label to its cluster.
        Each cluster is a set of term-ids.
    """
    # Case: fewer than 5 terms to cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        clusters = {}
        for i, tid in enumerate(term_ids_to_embs):
            clusters[i] = {tid}
        return clusters

    # Case: 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = [(k, v) for k, v in term_ids_to_embs.items()]
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print(' Density:', results['density'])
    clusters = defaultdict(set)
    for i in range(len(term_ids_embs_items)):
        term_id = term_ids_embs_items[i][0]
        label = labels[i]
        clusters[label].add(term_id)
    return clusters
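# Quick sanity check of perform_clustering: with fewer than 5 terms the
# function short-circuits and puts every term in its own singleton cluster,
# so no Clustering instance is needed at all.
small = {7: [0.1, 0.2], 8: [0.3, 0.4], 9: [0.5, 0.6]}
assert perform_clustering(small) == {0: {7}, 1: {8}, 2: {9}}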
def main(args):
    #-----------------------------------------------------#
    #          2D/3D Convolutional Autoencoder            #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)
        cae.prepare_data(args.sampler_type, args.max_patches, args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox, args.label_prob, args.load_data)
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                #
    #-----------------------------------------------------#
    """
    if args.program == 'AutSeg':
        asg = AutomaticSegmentation(model_name=args.model_name,
                                    patch_size=args.patch_size,
                                    patch_overlap=args.patch_overlap,
                                    input_dir=args.data_dir,
                                    model_dir=args.model_dir)
        asg.run()
        asg.run_postprocessing()
    """

    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal

    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of PCs for CPV > 0.8 and CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except IndexError:
        # No precomputed CV for this number of PCs; recompute it.
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)
        pca_model = get_pca_model(inputs)
        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(
            pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(),
            cluster_prediction,
            pcs_cpv,
            inputs.num_cluster,
            model_goal=model_goal)
        return variation(cluster_performances)
def dump_clusters():
    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])
    news = News()
    articles = news.get_articles()
    w2vobj.train()

    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title'])
                    for article in articles]

    # Sentence vectorization by the "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))
    if args['-cluster'] == 'agg':
        prune = args['-prune'] in ('true', 'True')
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune,
                                        int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO: dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
def init_run(self, run_params):
    if self.experiment_type == self.CLASSIFICATION:
        return MultiClassClassification(
            method_name=self.method_name,
            dataset_name=self.dataset_name,
            performance_function=self.performance_function,
            embeddings=self.node_embeddings,
            **run_params,
            node_labels=self.node_labels,
            node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.CLUSTERING:
        return Clustering(
            method_name=self.method_name,
            dataset_name=self.dataset_name,
            embeddings=self.node_embeddings,
            **run_params,
            node_labels=self.node_labels,
            performance_function=self.performance_function,
            node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.MULTI_LABEL_CLASSIFICATION:
        return MultiLabelClassification(
            method_name=self.method_name,
            dataset_name=self.dataset_name,
            node_labels=self.node_labels,
            **run_params,
            performance_function=self.performance_function,
            embeddings=self.node_embeddings,
            node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.LINK_PREDICTION:
        return LinkPrediction(
            method_name=self.method_name,
            dataset_name=self.dataset_name,
            node_embeddings=self.node_embeddings,
            **run_params,
            performance_function=self.performance_function,
            node2id_filepath=self.node2id_filepath)
def main():
    # X = [[1, 1], [1, 2], [1, 3], [4, 4], [4, 5], [5, 4], [5, 5],
    #      [10, 9], [10, 10], [20, 19], [20, 20]]
    X, Y = make_blobs(n_samples=5000, centers=10, cluster_std=0.60,
                      random_state=0)
    cluster = Clustering(X.tolist())
    cluster.buildTree(cluster.root)
    cluster.createLevelMatrix(cluster.root)
    cluster.createDistanceMatrix(numberOfCluster, numberOfLevels)
    query = [0, 0]

    start = timeit.default_timer()
    print("aug", aug_mmr(cluster, 0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for aug mmr: ', stop - start)

    start = timeit.default_timer()
    print("mmr", _mmr(0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for mmr: ', stop - start)
def full_realtime(precompute_fraction=.4, nqueries=50000, ndataunits=100000,
                  nmachines=50, r=3, np=.995, min_q_len=6, max_q_len=15,
                  ctype='fast', gcpatype='better'):
    g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
    queries = []
    q = 0
    while q < nqueries:
        node = random.randint(0, ndataunits - 1)
        line = iterative_dfs(g, node, path=[])
        if len(line) >= min_q_len:
            queries.append(line)
            q += 1
    graphfile = ('n' + str(len(queries) // 1000) + 'np' + str(np)
                 + ctype + gcpatype + 'test')
    with open(graphfile + '.csv', 'w', newline='') as f:
        w = csv.writer(f)
        for line in queries:
            w.writerow(line)
    print('Queries generated', len(queries))
    infile = graphfile
    max_to_process = min(nqueries, len(queries))
    queries = queries[:max_to_process]
    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed)
    rt_queries = queries[len(pre_computed):]
    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)
    gcpa_data.process(machines, dataunit_in_machine)
    rt_covers = []
    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print('Query: ', idx)
        cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                          machines, dataunit_in_machine, ctype)
        rt_covers.append(cover)
    return gcpa_data.covers, rt_covers
def kga(data, k, random_state=None):
    rand.seed(random_state)
    problem = Clustering(data)
    centroids, _, _ = genetic(problem, k, t_pop=10, taxa_cross=0.95,
                              taxa_mutacao=0.2)
    return centroids
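# A hedged usage sketch for kga: 'points' is hypothetical 2-D sample data,
# and genetic/Clustering/rand are assumed to come from the surrounding module.
points = [[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]]
centroids = kga(points, k=2, random_state=42)
print(centroids)  # expect one centroid per blob for a well-separated set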
def clusterize(self):
    print("\nclusterize")
    self.process_clustering_data()
    c = Clustering(self.clustering_df)
    c.k_means(2)
    c.k_means(3)
    c.k_means(4)
def setup(source, pdf_path):
    ngrams = NGramSpace(4)
    print("parsing documents at %s..." % source)
    docs = [extract_row(row, pdf_path, ngrams)
            for row in csv.DictReader(open(source, 'r'))]
    print("clustering %d documents..." % len(docs))
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)
def cluster(self, shapelets):
    """
    Uses a clustering algorithm to reduce the number of shapelets.

    :param shapelets: list of shapelet candidates
    :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
    :return: list of remaining shapelet candidates
    :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
    """
    clustering = Clustering(self.d_max)
    clustering.fit(shapelets)
    return clustering.nn_centers()
def test_pairs(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
    self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.closest_pair([6, 7]))

    self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
    self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.farthest_pair([6, 7]))
def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
    Do the clustering, based on the 91 features.

    Args:
        x: array of features
        df: dataframe of features
        n_clusters: number of clusters
        distance: either 'angular' or 'euclidean'
        method: one of 'K-medians', 'K-means', 'Hierarchical'
    Output:
        new_df: the dataframe labeled according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    relevant_features_id = [0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65,
                            76, 77, 83, 85, 90]
    keys_dict = ['0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5',
                 '4-6', '5-7', '6-8', '6-9', '8-9', '8-10', '9-12',
                 '10-11', '12-13']
    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)
    assert len(list(cls.keys())) == n_clusters
    d = pd.DataFrame()
    l = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', l)
    new_df = df.reset_index().merge(d).set_index('index')
    relevant_features_cs = []
    if method == 'Hierarchical':
        pass
    else:
        for i in range(len(cs)):
            d = {}
            cs_rf = cs[i][relevant_features_id]
            for k in range(len(keys_dict)):
                d[keys_dict[k]] = cs_rf[k]
            relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs
def test_distance(self):
    raw_docs = ['a b c', 'b c d', 'd e f']
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)
    self.assertEqual(0, c.distance[0, 0])
    self.assertEqual(0.5, c.distance[1, 0])
    self.assertEqual(0, c.distance[1, 1])
    self.assertEqual(1.0, c.distance[2, 0])
    self.assertEqual(0.8, c.distance[2, 1])
    self.assertEqual(0, c.distance[2, 2])
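# A worked check of the expected distances above, assuming (as the numbers
# suggest) that Clustering uses Jaccard distance on unigram sets:
#   'a b c' vs 'b c d': |{b, c}| / |{a, b, c, d}| = 2/4, distance 1 - 0.5 = 0.5
#   'a b c' vs 'd e f': no shared tokens, distance 1.0
#   'b c d' vs 'd e f': |{d}| / |{b, c, d, e, f}| = 1/5, distance 1 - 0.2 = 0.8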
def test_nonseeded_clustering(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)
    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 1)
    self.assertTrue(c.min_link() in [(4, 3), (5, 3)])
    c.merge(3, 4)
    c.merge(3, 5)
    self.assertEqual((7, 6), c.min_link())
def cluster_data(data):
    etl = Etl()
    df = etl.process_data(data)
    df = etl.generate_rfm(df)
    df = etl.normalize_df(df)
    clustering = Clustering()
    [metrics, clusters] = clustering.generate_cluster(df)
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    try:
        requests.post(server_url + '/metrics', headers=headers, json=metrics)
        requests.post(server_url + '/clusters', headers=headers, json=clusters)
    except Exception as e:
        print('Error', e)
def calculate_ref_wk(self, method, k):
    self.wk_refs = []
    for ref in range(self.refs.shape[2]):
        ref_clustering = Clustering(self.refs[:, :, ref], k)
        model, document_topic, word_topic = getattr(ref_clustering, method)()
        clusters = ref_clustering.document_topic.argmax(axis=1)
        wk_ref = self.calculate_wk(self.refs[:, :, ref], clusters)
        log_wk_ref = np.log(wk_ref)
        self.wk_refs.append(log_wk_ref)
    return self.wk_refs
def precompute_clustering(pre_computed, machines, dataunit_in_machine):
    clustering = Clustering(pre_computed)
    # Indexed by cluster: this list stores the necessary G-part information
    # for each of the clusters.
    parts_data = []
    ctr = 0
    for cluster in clustering.clusters:
        print('%d out of %d' % (ctr, len(clustering.clusters)))
        ctr += 1
        part_covers, dataunit_in_parts = gcpa_precompute_rt(
            cluster, machines, dataunit_in_machine)
        parts_data.append((part_covers, dataunit_in_parts))
    return clustering, parts_data
def test_nearest_neighbors(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)
    c.pp_distance(range(0, len(test_docs)))

    self.assertEqual([1], c.closest_neighbors([0], 1))
    self.assertEqual([1, 2], c.closest_neighbors([0], 2))
    self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
    self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))
    self.assertEqual([5], c.closest_neighbors([3, 4], 1))
    self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))
def main(fn, clusters_no):
    geo_locs = []
    # read location data from a csv file and store each location
    # as a Point(latit, longit) object
    df = pd.read_csv(fn)
    for index, row in df.iterrows():
        loc_ = Point(float(row['LAT']), float(row['LON']))
        geo_locs.append(loc_)
    # run k-means clustering
    cluster = Clustering(geo_locs, clusters_no)
    flag = cluster.k_means(False)
    if flag == -1:
        print("Error in arguments!")
    else:
        # the clustering result is a list of lists,
        # where each list represents one cluster
        print("Clustering results:")
        cluster.print_clusters(cluster.clusters)
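# A small driver for the main above, assuming Point and Clustering come from
# the surrounding module; the CSV columns LAT/LON match what main reads, and
# 'locs.csv' is a throwaway file created just for this sketch.
import pandas as pd

pd.DataFrame({'LAT': [48.85, 48.86, 40.71],
              'LON': [2.35, 2.36, -74.01]}).to_csv('locs.csv', index=False)
main('locs.csv', clusters_no=2)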
def test_clustering(self):
    raw_docs = ['a b c', 'b c d', 'd e f']
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)
    self.assertEqual([1, 1, 2], c.assignments)

    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 0)
    self.assertEqual([2, 2, 2], c.assignments)
def printClusters(reduced_data, algo="kmean"):
    # Plot the data with matplotlib
    clust = Clustering(reduced_data, 5)
    if algo == "ga":
        clust.GA(10)
    else:
        clust.kMeans()
    centroids, clusterAssment = clust.centroids, clust.clusterAssment
    # collect the x/y coordinates of each of the 5 clusters
    clusterX = [[] for _ in range(5)]
    clusterY = [[] for _ in range(5)]
    for i in range(len(reduced_data)):
        label = int(clusterAssment[i][0, 0])
        clusterX[label].append(reduced_data[i, 0])
        clusterY[label].append(reduced_data[i, 1])
    # one marker style per cluster
    styles = ['sg', 'ob', 'or', 'mo', 'ys']
    for x, y, style in zip(clusterX, clusterY, styles):
        plot(x, y, style)
    show()
def filter_repeated_hits(data):
    cl = Clustering(10, 0.1)  # cluster with default values
    cl.fill_clusters(data['x1'], data['x2'])
    toDelete = [False] * len(cl.clabels)
    for icl in range(cl.ncl):  # loop over clusters
        if cl.nhits[icl] > 50:
            counter = 0
            for i in range(len(cl.clabels)):  # loop over hits
                if cl.clabels[i] == icl:
                    counter = counter + 1
                    if counter > 50:
                        toDelete[i] = True
    for key in data.keys():
        data[key][:] = [value for value, flag in zip(data[key], toDelete)
                        if not flag]
    return data
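# In words: filter_repeated_hits caps every cluster at 50 hits. For any
# cluster found by Clustering(10, 0.1) with more than 50 members, the 51st
# and later hits (in input order) are flagged, then stripped in place from
# every column of the data dictionary, keeping the columns aligned.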
def get_model(encoder, x):
    clusters = 10
    learning_rate = 0.01
    momentum = 0.9
    output = encoder.predict(x)
    centroids, prediction = cl.get_centroids(output, clusters)
    print("DEC: initial centroids found")
    clustering_layer = Clustering(clusters, weights=centroids,
                                  prediction=prediction, name='clustering')
    model = Sequential([encoder, clustering_layer])
    # model.compile(loss='kullback_leibler_divergence', optimizer='adadelta')
    model.compile(loss=cl.calculate_kl,
                  optimizer=SGD(lr=learning_rate, momentum=momentum))
    return model
def start():
    # Set up logger
    logger = logging.getLogger('decoder')
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setLevel(logging.ERROR)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Set up components
    preprocessor = Preprocessor()
    clustering = Clustering(5)
    decoder = Decoder(logger)
    _input = Input()

    filename = _input.read_file('audio/testfile3.wav')
    # filename = _input.read_file('audio/testfile4.wav')
    preprocessor.read_csv(filename)
    # preprocessor.read_csv('simulation_2018-09-27_17-13-19.csv')
    preprocessor.plot()
    preprocessor.plot(True)
    preprocessor.process_loudness()
    preprocessor.plot()
    preprocessor.plot(True)

    training_batch = preprocessor.get_batch()
    labels = clustering.train(training_batch)
    mapping = clustering.get_label_mapping()
    signals = list()
    for label in labels:
        signals.append(mapping.get(label))
    for signal in signals:
        decoder.decode(signal)
    print(decoder.message)
def clustering(x, n_clusters):
    """
    Do the clustering, based on the 91 features.

    We compute the reconstructed poses only with the following default
    parameters:
        method: 'K-Medians'
        distance: 'angular'

    Args:
        x: array of features
        n_clusters: number of clusters
    Output:
        new_df: the dataframe labeled according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    # df, relevant_features_id, and keys_dict are assumed to be defined at
    # module level (compare the parameterized variant of this function above).
    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    d = pd.DataFrame()
    l = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', l)
    new_df = df.reset_index().merge(d).set_index('index')
    assert len(cs) == n_clusters
    relevant_features_cs = []
    for i in range(len(cs)):
        d = {}
        cs_rf = cs[i][relevant_features_id]
        for k in range(len(keys_dict)):
            d[keys_dict[k]] = cs_rf[k]
        relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs