def get_decomp(self, method='MDS', **kwargs):
    optioncheck(method, ['MDS', 'spectral'])
    cl = Clustering(self.dm)
    if method == 'MDS':
        return cl.MDS_decomp()
    if method == 'spectral':
        return cl.spectral_decomp(**kwargs)

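# Usage sketch (hypothetical caller): assuming `obj` is an instance of the
# class this method belongs to, with a distance matrix stored in `obj.dm`.
# Keyword arguments are only forwarded on the spectral path.
#
#   mds_coords = obj.get_decomp(method='MDS')
#   spec_coords = obj.get_decomp(method='spectral')  # extra kwargs go to spectral_decomp
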
def __init__(self):
    self.nwalkers = 32
    self.ndim = 7
    filename = "tutorial.h5"
    backend = emcee.backends.HDFBackend(filename)
    backend.reset(self.nwalkers, self.ndim)
    hod_params = {"M_min": 0,
                  "galaxy_density": 0.00057,
                  "boxsize": 1000,
                  "log_halo_mass_bins": np.arange(10, 15, 0.1),
                  "halo_histo": np.loadtxt("../data/halo_central_histo.dat")}
    halofile = "../../ELG_HOD_optimization/data/halo_M200b_0.54980_for_mock.dat"
    self.mockfactory = MockFactory(halofile, boxsize=1000, cvir_fac=1,
                                   hod_parameters=hod_params)
    # clustering calculator
    rbins = np.logspace(np.log10(0.1), np.log10(70), 21)
    self.cluster = Clustering(rbins)
    # read xi and wp from data, read cov matrix
    self.clustering_data = np.loadtxt("../data/clustering_data.dat")
    self.scaled_cov = np.loadtxt("../data/scaled_cov.dat")
    # with Pool(10) as pool:
    #     self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim, self.log_prob,
    #                                          backend=backend, pool=pool)
    #     self.run()
    self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim, self.log_prob,
                                         backend=backend)
    self.run()

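# Note (an assumption about code not shown here): emcee's EnsembleSampler
# expects the log-probability callable to accept a parameter vector of
# length ndim, so self.log_prob should look roughly like the hypothetical
# sketch below; in_prior and chi_squared are illustrative names only.
#
#   def log_prob(self, theta):                 # theta: array of 7 HOD parameters
#       if not self.in_prior(theta):           # hypothetical prior check
#           return -np.inf
#       return -0.5 * self.chi_squared(theta)  # hypothetical likelihood
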
def cluster_data(data, ov_min=10, ov_min1=0.2):
    cl = Clustering(ov_min, ov_min1)
    cl.fill_clusters(data['x1'], data['x2'])
    c_labels = cl.clabels
    nhits = len(c_labels)
    n_clust = cl.ncl
    x1cl = cl.cluster_x1
    x2cl = cl.cluster_x2
    ycl = []
    pcentcl = []
    namecl = []
    detailcl = []
    x1tcl = []
    x2tcl = []
    dxtcl = []
    for i in range(0, n_clust):
        ycl.append(float(i + 1) / 2.)
        # take the attributes of the first hit belonging to cluster i
        for j in range(0, nhits):
            if c_labels[j] == i:
                pcentcl.append(data['pcent'][j])
                namecl.append(data['name'][j])
                detailcl.append(data['detail'][j])
                x1tcl.append(data['x1t'][j])
                x2tcl.append(data['x2t'][j])
                dxtcl.append(data['dxt'][j])
                break
    new_data = dict(x1=x1cl,
                    x2=x2cl,
                    dx=[end - beg for beg, end in zip(x1cl, x2cl)],
                    xm=[(beg + end) / 2 for beg, end in zip(x1cl, x2cl)],
                    x1t=x1tcl,
                    x2t=x2tcl,
                    dxt=dxtcl,
                    y=ycl,
                    nhits=cl.nhits,
                    name=namecl,
                    pcent=pcentcl,
                    detail=detailcl)
    return new_data, n_clust

def main(args):
    #-----------------------------------------------------#
    #          2D/3D Convolutional Autoencoder             #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)
        cae.prepare_data(args.sampler_type,
                         args.max_patches,
                         args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox,
                         args.label_prob,
                         args.load_data)
        # train a fresh model when no saved model is given, then predict
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                 #
    #-----------------------------------------------------#
    """
    if args.program == 'AutSeg':
        asg = AutomaticSegmentation(model_name=args.model_name,
                                    patch_size=args.patch_size,
                                    patch_overlap=args.patch_overlap,
                                    input_dir=args.data_dir,
                                    model_dir=args.model_dir)
        asg.run()
        asg.run_postprocessing()
    """

    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()

def generate(self, keys, url):
    json_work("other_files/work_file.json", "w", [])  # reset the work file
    print(f'Keys received: {len(keys)}')
    if len(keys) > 0:
        # generate pre-templates from keys with unique stemming
        self.generate_pretmp(keys)
        print(f'Keys after removing duplicates: {len(self.work_file)}')
        time.sleep(2)
        if len(self.work_file) > 0:
            with ThreadPoolExecutor(5) as executor:
                for _ in executor.map(self.template_generated, self.work_file):
                    pass
            work = json_work("other_files/work_file.json", "r")
            if len(work) > 0:
                gen_data = sorted(work, key=lambda x: x["frequency"]["basic"], reverse=True)
                json_work("other_files/work_file.json", "w", gen_data)
                gen_data += json_work("other_files/main.json", "r")
                gen_data = sorted(gen_data, key=lambda x: x["frequency"]["basic"], reverse=True)
                json_work("other_files/main.json", "w", gen_data)
                print(f"url {url} processed")
                clustering = Clustering(json_work("other_files/work_file.json", "r"), url)
                clustering.run()
            else:
                print("Moving on to the next url")
                return

def perform_clustering(term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their embeddings.
    Return:
        A dictionary mapping each cluster label to its cluster.
        Each cluster is a set of term-ids.
    """
    # Case: fewer than 5 terms to cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        clusters = {}
        for i, tid in enumerate(term_ids_to_embs):
            clusters[i] = {tid}
        return clusters

    # Case: 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = [(k, v) for k, v in term_ids_to_embs.items()]
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print(' Density:', results['density'])
    clusters = defaultdict(set)
    for i in range(len(term_ids_embs_items)):
        term_id = term_ids_embs_items[i][0]
        label = labels[i]
        clusters[label].add(term_id)
    return clusters

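# The small-input branch returns one singleton cluster per term. A minimal
# illustration of the expected shape (not executed against the real
# Clustering class):
#
#   perform_clustering({7: [0.1, 0.2], 9: [0.3, 0.4]})
#   # -> {0: {7}, 1: {9}}   (dict insertion order determines the labels)
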
def cluster(self, shapelets):
    """
    Uses a clustering algorithm to reduce the number of shapelets.

    :param shapelets: list of shapelet candidates
    :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
    :return: list of remaining shapelet candidates
    :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
    """
    clustering = Clustering(self.d_max)
    clustering.fit(shapelets)
    return clustering.nn_centers()

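# Hypothetical usage, assuming `extractor` is an instance of the surrounding
# class with its distance threshold d_max already set:
#
#   candidates = np.random.rand(100, 30, 1)  # 100 univariate candidates of length 30
#   pruned = extractor.cluster(candidates)   # representatives chosen by nn_centers()
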
def test_pairs(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
    self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.closest_pair([6, 7]))

    self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
    self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
    self.assertEqual((7, 6), c.farthest_pair([6, 7]))

def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
    Do the clustering, based on the 91 features.

    Args:
        x: array of features
        df: dataframe of features
        n_clusters: number of clusters
        distance: one of 'angular' or 'euclidean'
        method: one of 'K-medians', 'K-means' or 'Hierarchical'
    Output:
        new_df: the labeled dataframe, according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    relevant_features_id = [0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90]
    keys_dict = ['0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6',
                 '5-7', '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13']
    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)
    assert len(list(cls.keys())) == n_clusters
    d = pd.DataFrame()
    l = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', l)
    new_df = df.reset_index().merge(d).set_index('index')
    relevant_features_cs = []
    if method == 'Hierarchical':
        pass
    else:
        for i in range(len(cs)):
            d = {}
            cs_rf = cs[i][relevant_features_id]
            for k in range(len(keys_dict)):
                d[keys_dict[k]] = cs_rf[k]
            relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs

def cluster_data(data):
    etl = Etl()
    df = etl.process_data(data)
    df = etl.generate_rfm(df)
    df = etl.normalize_df(df)
    clustering = Clustering()
    [metrics, clusters] = clustering.generate_cluster(df)
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    try:
        requests.post(server_url + '/metrics', headers=headers, json=metrics)
        requests.post(server_url + '/clusters', headers=headers, json=clusters)
    except Exception as e:
        print('Error', e)

def main(args, config):
    wDir = os.getcwd()

    # Instance Preprocessing class
    window = Preprocessing(args.fasta_file, config['win_length'], config['win_step'])
    window.output_window()
    print >> sys.stderr, "Creating windows_sequence.fasta"

    # Instance Similarity and Composition class
    sim = Similarity(args.fasta_file, config['score_adj'], wDir)
    sim_matrix = sim.mcl_perform()
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()

    # Join similarity and composition matrix for PCA
    join = pd.concat([comp_matrix, sim_matrix], axis=1, join='inner')
    print >> sys.stderr, "Calculating similarity and composition matrix"

    # Instance Reduction class
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print >> sys.stderr, "Performing PCA"

    # Instance Clustering class
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print >> sys.stderr, "Performing clustering plot"

    # Instance ClusterReport class
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print >> sys.stderr, "Doing report of clusters"

    # Instance Validate class
    valid = Validate(file_name, args.fasta_file, wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print >> sys.stderr, "Validation of results"

    # Instance ParseJplace class
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print >> sys.stderr, "Doing profiles"

    # Instance Profiles class
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print >> sys.stderr, "Doing permutations"

    # Instance StatsBinom class
    finalResult = StatsBinom(args.fasta_file, config['win_length'], bestWin)
    finalResult.binomial()

    cleaning(file_name)

def test_nearest_neighbors(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)
    c.pp_distance(range(0, len(test_docs)))

    self.assertEqual([1], c.closest_neighbors([0], 1))
    self.assertEqual([1, 2], c.closest_neighbors([0], 2))
    self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
    self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))

    self.assertEqual([5], c.closest_neighbors([3, 4], 1))
    self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))

def main():
    # X = [[1, 1], [1, 2], [1, 3], [4, 4], [4, 5], [5, 4], [5, 5],
    #      [10, 9], [10, 10], [20, 19], [20, 20]]
    X, Y = make_blobs(n_samples=5000, centers=10, cluster_std=0.60, random_state=0)
    cluster = Clustering(X.tolist())
    cluster.buildTree(cluster.root)
    cluster.createLevelMatrix(cluster.root)
    # numberOfCluster and numberOfLevels are assumed to be module-level globals
    cluster.createDistanceMatrix(numberOfCluster, numberOfLevels)
    query = [0, 0]

    start = timeit.default_timer()
    print("aug", aug_mmr(cluster, 0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for aug mmr: ', stop - start)

    start = timeit.default_timer()
    print("mmr", _mmr(0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for mmr: ', stop - start)

def main(fn, clusters_no):
    geo_locs = []
    # read location data from csv file and store each location as a Point(latit, longit) object
    df = pd.read_csv(fn)
    for index, row in df.iterrows():
        loc_ = Point(float(row['LAT']), float(row['LON']))  # tuple for location
        geo_locs.append(loc_)
    # run k_means clustering
    cluster = Clustering(geo_locs, clusters_no)
    flag = cluster.k_means(False)
    if flag == -1:
        print("Error in arguments!")
    else:
        # clustering results is a list of lists where each list represents one cluster
        print("Clustering results:")
        cluster.print_clusters(cluster.clusters)

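# Hypothetical input: a CSV file with LAT and LON columns, for example
#
#   LAT,LON
#   40.7128,-74.0060
#   34.0522,-118.2437
#   41.8781,-87.6298
#
# main('locations.csv', clusters_no=2)
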
def init_run(self, run_params):
    if self.experiment_type == self.CLASSIFICATION:
        return MultiClassClassification(method_name=self.method_name,
                                        dataset_name=self.dataset_name,
                                        performance_function=self.performance_function,
                                        embeddings=self.node_embeddings,
                                        **run_params,
                                        node_labels=self.node_labels,
                                        node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.CLUSTERING:
        return Clustering(method_name=self.method_name,
                          dataset_name=self.dataset_name,
                          embeddings=self.node_embeddings,
                          **run_params,
                          node_labels=self.node_labels,
                          performance_function=self.performance_function,
                          node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.MULTI_LABEL_CLASSIFICATION:
        return MultiLabelClassification(method_name=self.method_name,
                                        dataset_name=self.dataset_name,
                                        node_labels=self.node_labels,
                                        **run_params,
                                        performance_function=self.performance_function,
                                        embeddings=self.node_embeddings,
                                        node2id_filepath=self.node2id_filepath)
    elif self.experiment_type == self.LINK_PREDICTION:
        return LinkPrediction(method_name=self.method_name,
                              dataset_name=self.dataset_name,
                              node_embeddings=self.node_embeddings,
                              **run_params,
                              performance_function=self.performance_function,
                              node2id_filepath=self.node2id_filepath)

def dump_clusters():
    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])
    news = News()
    articles = news.get_articles()
    w2vobj.train()

    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title'])
                    for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))
    if args['-cluster'] == 'agg':
        if args['-prune'] == 'true' or args['-prune'] == 'True':
            utilities.redis_kmeans_clusters(cluster_obj, articles, True, int(args['-limit']), r_conn)
            print("redis dump complete")
        else:
            utilities.redis_kmeans_clusters(cluster_obj, articles, False, int(args['-limit']), r_conn)
            print("redis dump complete")
    else:
        # TODO: dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)

def printClusters(reduced_data, algo="kmean"):
    # Plot the data with matplotlib
    clust = Clustering(reduced_data, 5)
    if algo == "ga":
        clust.GA(10)
    else:
        clust.kMeans()
    centroids, clusterAssment = clust.centroids, clust.clusterAssment

    cluster1X, cluster1Y = [], []
    cluster2X, cluster2Y = [], []
    cluster3X, cluster3Y = [], []
    cluster4X, cluster4Y = [], []
    cluster5X, cluster5Y = [], []
    for i in range(len(reduced_data)):
        if clusterAssment[i][0, 0] == 0:
            cluster1X.append(reduced_data[i, 0])
            cluster1Y.append(reduced_data[i, 1])
        if clusterAssment[i][0, 0] == 1:
            cluster2X.append(reduced_data[i, 0])
            cluster2Y.append(reduced_data[i, 1])
        if clusterAssment[i][0, 0] == 2:
            cluster3X.append(reduced_data[i, 0])
            cluster3Y.append(reduced_data[i, 1])
        if clusterAssment[i][0, 0] == 3:
            cluster4X.append(reduced_data[i, 0])
            cluster4Y.append(reduced_data[i, 1])
        if clusterAssment[i][0, 0] == 4:
            cluster5X.append(reduced_data[i, 0])
            cluster5Y.append(reduced_data[i, 1])
    plot(cluster1X, cluster1Y, 'sg')
    plot(cluster2X, cluster2Y, 'ob')
    plot(cluster3X, cluster3Y, 'or')
    plot(cluster4X, cluster4Y, 'mo')
    plot(cluster5X, cluster5Y, 'ys')
    show()

def filter_repeated_hits(data):
    cl = Clustering(10, 0.1)  # cluster with default values
    cl.fill_clusters(data['x1'], data['x2'])
    toDelete = [False] * len(cl.clabels)
    for icl in range(cl.ncl):  # loop over clusters
        if cl.nhits[icl] > 50:
            counter = 0
            for i in range(len(cl.clabels)):  # loop over hits
                if cl.clabels[i] == icl:
                    counter = counter + 1
                    if counter > 50:
                        toDelete[i] = True
    for key in data.keys():
        data[key][:] = [value for value, flag in zip(data[key], toDelete) if not flag]
    return data

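# Self-contained illustration of the mask-based deletion idiom used above
# (plain Python, no Clustering dependency): entries flagged True are dropped
# from every column while preserving order and alignment across keys.
data = {'x1': [1, 2, 3, 4], 'x2': [10, 20, 30, 40]}
to_delete = [False, True, False, True]
for key in data.keys():
    data[key][:] = [value for value, flag in zip(data[key], to_delete) if not flag]
assert data == {'x1': [1, 3], 'x2': [10, 30]}
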
def full_realtime(precompute_fraction=.4, nqueries=50000, ndataunits=100000, nmachines=50,
                  r=3, np=.995, min_q_len=6, max_q_len=15, ctype='fast', gcpatype='better'):
    g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
    queries = []
    q = 0
    while q < nqueries:
        node = random.randint(0, ndataunits - 1)
        line = iterative_dfs(g, node, path=[])
        if len(line) >= min_q_len:
            queries.append(line)
            q += 1
    graphfile = 'n' + str(len(queries) / 1000) + 'np' + str(np) + ctype + gcpatype + 'test'
    with open(graphfile + '.csv', 'wb') as f:
        w = csv.writer(f)
        for line in queries:
            w.writerow(line)
    print 'Queries generated', len(queries)
    infile = graphfile
    max_to_process = min(nqueries, len(queries))
    queries = queries[:max_to_process]
    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed)
    rt_queries = queries[len(pre_computed):]
    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)
    gcpa_data.process(machines, dataunit_in_machine)
    rt_covers = []
    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print 'Query: ', idx
        cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data, machines,
                                          dataunit_in_machine, ctype)
        rt_covers.append(cover)
    return gcpa_data.covers, rt_covers

def kga(data, k, random_state=None):
    rand.seed(random_state)
    problem = Clustering(data)
    centroids, _, _ = genetic(problem, k, t_pop=10, taxa_cross=0.95, taxa_mutacao=0.2)
    return centroids

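# Sketch of a call (hypothetical data; `genetic` and `Clustering` come from
# this module). The Portuguese keyword arguments above are the crossover rate
# (taxa_cross) and mutation rate (taxa_mutacao).
#
#   data = [[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]]
#   centroids = kga(data, k=2, random_state=42)
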
def setup(source, pdf_path):
    ngrams = NGramSpace(4)
    print "parsing documents at %s..." % source
    docs = [extract_row(row, pdf_path, ngrams) for row in csv.DictReader(open(source, 'r'))]
    print "clustering %d documents..." % len(docs)
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)

def start():
    # Set up logger
    logger = logging.getLogger('decoder')
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setLevel(logging.ERROR)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Set up components
    preprocessor = Preprocessor()
    clustering = Clustering(5)
    decoder = Decoder(logger)
    _input = Input()

    filename = _input.read_file('audio/testfile3.wav')
    # filename = _input.read_file('audio/testfile4.wav')
    preprocessor.read_csv(filename)
    # preprocessor.read_csv('simulation_2018-09-27_17-13-19.csv')
    preprocessor.plot()
    preprocessor.plot(True)
    preprocessor.process_loudness()
    preprocessor.plot()
    preprocessor.plot(True)

    training_batch = preprocessor.get_batch()
    labels = clustering.train(training_batch)
    mapping = clustering.get_label_mapping()
    signals = list()
    for label in labels:
        signals.append(mapping.get(label))
    for signal in signals:
        decoder.decode(signal)
    print(decoder.message)

def do_compute(reference_txt, pre_clustering_txt, groundtruth_npy):
    # load reference clusters
    reference = Clustering.load(reference_txt)

    # load hypothesis clusters
    hypothesis = Clustering.load(pre_clustering_txt)

    # number of hypothesis clusters
    nPreClusters = len(hypothesis.clusters)
    preClusters = sorted(hypothesis.clusters)

    # groundtruth[i, j] contains
    #    1 if all elements in clusters i and j are in the same cluster
    #    0 if elements in clusters i and j are not in the same cluster
    #   -1 if either cluster i or j is not pure
    groundtruth = np.empty((nPreClusters, nPreClusters), dtype=int)

    # clustersRef[c] contains reference cluster for pure hypothesis cluster c
    # in case c is not pure, clustersRef[c] is None
    clustersRef = {}
    for c in preClusters:
        r = set([reference[i] for i in hypothesis.clusters[c]])
        if len(r) == 1:
            clustersRef[c] = r.pop()
        else:
            clustersRef[c] = None

    for k, ci in enumerate(preClusters):
        if clustersRef[ci] is None:
            groundtruth[ci, :] = -1
            groundtruth[:, ci] = -1
            continue
        for cj in preClusters[k:]:
            if clustersRef[cj] is not None:
                groundtruth[ci, cj] = clustersRef[ci] == clustersRef[cj]
                groundtruth[cj, ci] = groundtruth[ci, cj]

    # save groundtruth matrix
    np.save(groundtruth_npy, groundtruth)

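# Worked example of the matrix semantics (assuming integer cluster ids that
# can index the array, as the code above does): with hypothesis clusters
# 0 = {a, b}, 1 = {c}, 2 = {d, e} and reference clusters {a, b, c} and {d},
# cluster 2 is impure (d and e sit in different reference clusters), so
#
#   groundtruth = [[ 1,  1, -1],
#                  [ 1,  1, -1],
#                  [-1, -1, -1]]
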
def clustering(x, n_clusters):
    """
    Do the clustering, based on the 91 features.
    We compute the reconstructed poses only with the following default parameters:
        method: 'K-Medians'
        distance: 'angular'

    Args:
        x: array of features
        n_clusters: number of clusters
    Output:
        new_df: the labeled dataframe, according to the clustering algorithm
        relevant_features_cs: a list with the relevant features (angles of
            the consecutive limbs) of the centroids
        cs: dictionary with the centroid features
    """
    # df, relevant_features_id and keys_dict are assumed to be module-level
    # globals here (the sibling `clustering` function above shows the values
    # they take)
    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    d = pd.DataFrame()
    l = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])
    d.columns = df.columns
    d.insert(91, 'label', l)
    new_df = df.reset_index().merge(d).set_index('index')
    assert len(cs) == n_clusters
    relevant_features_cs = []
    for i in range(len(cs)):
        d = {}
        cs_rf = cs[i][relevant_features_id]
        for k in range(len(keys_dict)):
            d[keys_dict[k]] = cs_rf[k]
        relevant_features_cs.append(d)
    return new_df, relevant_features_cs, cs

def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal

    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of pcs for CPV > 0.8 and CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except:  # no cached row matches, so recompute the CV from scratch
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)
        pca_model = get_pca_model(inputs)
        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(), cluster_prediction, pcs_cpv, inputs.num_cluster,
            model_goal=model_goal)
        return variation(cluster_performances)

def calculate_ref_wk(self, method, k):
    self.wk_refs = []
    for ref in range(self.refs.shape[2]):
        ref_clustering = Clustering(self.refs[:, :, ref], k)
        model, document_topic, word_topic = getattr(ref_clustering, method)()
        clusters = ref_clustering.document_topic.argmax(axis=1)
        wk_ref = self.calculate_wk(self.refs[:, :, ref], clusters)
        log_wk_ref = np.log(wk_ref)
        self.wk_refs.append(log_wk_ref)
    return self.wk_refs

def test_distance(self):
    raw_docs = ['a b c', 'b c d', 'd e f']
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)

    self.assertEqual(0, c.distance[0, 0])
    self.assertEqual(0.5, c.distance[1, 0])
    self.assertEqual(0, c.distance[1, 1])
    self.assertEqual(1.0, c.distance[2, 0])
    self.assertEqual(0.8, c.distance[2, 1])
    self.assertEqual(0, c.distance[2, 2])

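# The expected values above are consistent with Jaccard distance on unigram
# sets, d(A, B) = 1 - |A & B| / |A | B|. A self-contained check using plain
# sets rather than the NGramSpace/Clustering classes:
def jaccard_distance(a, b):
    a, b = set(a.split()), set(b.split())
    return 1.0 - len(a & b) / float(len(a | b))

assert jaccard_distance('a b c', 'b c d') == 0.5  # overlap {b, c}, union of 4
assert jaccard_distance('b c d', 'd e f') == 0.8  # overlap {d}, union of 5
assert jaccard_distance('a b c', 'd e f') == 1.0  # no overlap
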
def test_clustering(self):
    raw_docs = ['a b c', 'b c d', 'd e f']
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in raw_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)
    self.assertEqual([1, 1, 2], c.assignments)

    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 0)
    self.assertEqual([2, 2, 2], c.assignments)

def precompute_clustering(pre_computed, machines, dataunit_in_machine):
    clustering = Clustering(pre_computed)
    # Indexed with the clusters. This array will store the necessary G-part
    # information for each of the clusters
    parts_data = []
    ctr = 0
    for cluster in clustering.clusters:
        print '%d out of %d' % (ctr, len(clustering.clusters))
        ctr += 1
        part_covers, dataunit_in_parts = gcpa_precompute_rt(cluster, machines, dataunit_in_machine)
        parts_data.append((part_covers, dataunit_in_parts))
    return clustering, parts_data

def clusterize(self):
    print("\nclusterize")
    self.process_clustering_data()
    c = Clustering(self.clustering_df)
    c.k_means(2)
    c.k_means(3)
    c.k_means(4)

def do_it(image_txt, features_npy, clustering_txt, output_npy):
    # load image list
    with open(image_txt, 'r') as f:
        images = [int(line.strip()) for line in f.readlines()]
    image2index = {image: index for index, image in enumerate(images)}

    # load hypothesis clusters
    clustering = Clustering.load(clustering_txt)
    clusters = sorted(clustering.clusters)

    # load features
    features = np.load(features_npy)
    # L2 normalization (for later dot product)
    features = (features.T / np.sqrt(np.sum((features ** 2), axis=1))).T

    # find centroid image for every cluster
    centroid = {}
    for c, cluster in enumerate(clusters):
        # list of images in current cluster
        _images = clustering.clusters[cluster]
        # corresponding indices in features matrix
        _indices = [image2index[image] for image in _images]
        # compute distance matrix between all images of current cluster
        _features = features[_indices, :]
        _distance = 1. - np.dot(_features, _features.T)
        # find centroid image
        i = np.argmin(np.sum(_distance, axis=0))
        centroid[cluster] = _images[i]
        print 'image %s is centroid of cluster %s' % (centroid[cluster], cluster)

    # centroid indices in features matrix
    _indices = [image2index[centroid[cluster]] for cluster in clusters]

    # compute distance matrix between all centroids
    _features = features[_indices, :]
    _distance = 1. - np.dot(_features, _features.T)

    # save distance matrix
    with open(output_npy, 'wb') as f:
        np.save(f, _distance)

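# Self-contained sketch of the centroid rule used above: after L2-normalising
# the rows, cosine distance is 1 - dot product, and the centroid is the row
# with the smallest summed distance to the rest of its cluster.
import numpy as np

feats = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
feats = (feats.T / np.sqrt(np.sum(feats ** 2, axis=1))).T
dist = 1.0 - np.dot(feats, feats.T)
assert np.argmin(np.sum(dist, axis=0)) == 1  # the middle row is most central
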
def test_nonseeded_clustering(self):
    ngrams = NGramSpace(1)
    docs = [ngrams.parse(raw) for raw in test_docs]
    c = Clustering(docs)

    self.assertEqual((1, 0), c.min_link())
    c.merge(1, 0)
    self.assertEqual((2, 1), c.min_link())
    c.merge(2, 1)

    self.assertTrue(c.min_link() in [(4, 3), (5, 3)])
    c.merge(3, 4)
    c.merge(3, 5)

    self.assertEqual((7, 6), c.min_link())

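# min_link() behaves like a single-linkage agglomeration step: the closest
# pair of documents is merged first. A self-contained sketch of one such step
# over the toy distances from test_distance, reproducing the [1, 1, 2]
# assignment seen in test_clustering:
dist = {(1, 0): 0.5, (2, 0): 1.0, (2, 1): 0.8}
assignments = [0, 1, 2]                 # each doc starts in its own cluster
pair = min(dist, key=dist.get)          # -> (1, 0), the closest pair
assignments = [pair[0] if a in pair else a for a in assignments]
assert assignments == [1, 1, 2]
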
def detect_meteors(rf_dir, id_dir, noise_dir, output_dir, t0=None, t1=None,
                   rxch='zenith-l', txch='tx-h', snr_thresh=1,
                   rmin_km=70, rmax_km=140, vmin_kps=7, vmax_kps=72,
                   eps=0.5, min_samples=5, tscale=1, rscale=1, vscale=1,
                   debug=False):
    """Function to detect and summarize meteor head echoes.

    Arguments
    ---------
    rf_dir : string or list
        RF data directory or directories.
    id_dir : string
        ID code metadata directory.
    noise_dir : string
        RX noise metadata directory.
    output_dir : string
        Meteor data output directory.
    t0 : float, optional
        Start time, seconds since epoch. If None, start at beginning of data.
    t1 : float, optional
        End time, seconds since epoch. If None, end at end of data.
    rxch : string, optional
        Receiver channel to process.
    txch : string, optional
        Transmitter channel.
    """
    # set up reader objects for data and metadata
    rfo = drf.DigitalRFReader(rf_dir)
    ido = drf.DigitalMetadataReader(id_dir)
    no = drf.DigitalMetadataReader(noise_dir)

    # infer time window to process based on bounds of data and metadata
    if t0 is None or t1 is None:
        bounds = []
        bounds.append(rfo.get_bounds(rxch))
        bounds.append(rfo.get_bounds(txch))
        bounds.append(ido.get_bounds())
        bounds.append(no.get_bounds())
        bounds = np.asarray(bounds)
        ss = np.max(bounds[:, 0])
        se = np.min(bounds[:, 1])

    fs = rfo.get_digital_rf_metadata(rxch)['samples_per_second']
    if t0 is None:
        s0 = ss
    else:
        s0 = int(np.uint64(t0 * fs))
    if t1 is None:
        s1 = se
    else:
        s1 = int(np.uint64(t1 * fs))

    # load pulse/coding information
    tmm = TimingModeManager.TimingModeManager()
    if os.path.exists('/tmp/tmm.hdf5'):
        tmm.loadFromHdf5('/tmp/tmm.hdf5', skip_lowlevel=True)
    else:
        tmm.loadFromHdf5(skip_lowlevel=True)

    # initialize generator that steps through data pulse by pulse
    pulse_data = data_generator(rfo, ido, no, tmm, s0, s1, rxch, txch)

    # initialize clustering object for grouping detections
    clustering = Clustering(eps, min_samples, tscale, rscale, vscale)

    # initialize CSV file for saving meteor clusters
    csvpath = os.path.join(output_dir, 'cluster_summaries.txt')
    csvfile = open(csvpath, "wb", 1)  # 1 => use line buffering
    cols = mp.summarize_meteor(None)
    csvwriter = csv.DictWriter(csvfile, cols)
    csvwriter.writeheader()

    # loop that steps through data one pulse at a time
    for k, (tx, rx) in enumerate(pulse_data):
        # marching periods as status update
        if (k % 100) == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        # matched filter
        mf_rx = mp.matched_filter(tx, rx, rmin_km, rmax_km)

        # meteor signal detection
        meteors = mp.detect_meteors(mf_rx, snr_thresh, vmin_kps, vmax_kps)

        # clustering of detections into single meteor head echoes
        for meteor in meteors:
            sys.stdout.write('*')
            sys.stdout.flush()
            new_clusters = clustering.addnext(pulse_num=k, **meteor)
            for c in new_clusters:
                sys.stdout.write('{0}'.format(c.cluster.values[0]))
                # summarize head echo and save to a data file
                cluster_summary = mp.summarize_meteor(c, debug=debug)
                csvwriter.writerow(cluster_summary)

    # tell clustering object that data is exhausted and to return any final clusters
    new_clusters = clustering.finish()
    for c in new_clusters:
        # summarize head echo and save to a data file
        cluster_summary = mp.summarize_meteor(c)
        csvwriter.writerow(cluster_summary)

    csvfile.close()

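# Hypothetical invocation (all paths are placeholders; Digital RF data plus
# ID-code and noise metadata must already exist on disk):
#
#   detect_meteors('/data/rf', '/data/metadata/id', '/data/metadata/noise',
#                  '/data/meteor_out', rxch='zenith-l', txch='tx-h',
#                  snr_thresh=1, eps=0.5, min_samples=5)
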
class SequenceCollection(object):
    """
    Orchestrating class that should:
      a) work as a central repository for the information generated by
         the subordinate classes, and
      b) be the only class directly interacted with by the user

    TO DO: implement consistent naming of methods (where appropriate)
    Prefixes:
        get_[something]  - returns the object implied by something
        put_[something]  - puts something in the class data structure
        show_[something] - prints something to screen
        plot_[something] - displays a plot of something
        _[something]     - private method
    """

    def __init__(self, input_dir=None, records=None, file_format="fasta", datatype="protein",
                 helper="./class_files/DV_wrapper.drw", tmpdir="/tmp", get_distances=False,
                 parallel_load=False, overwrite=True):
        # Unset Variables
        # Store some mappings for data retrieval
        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data
        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables
        self.tmpdir = tmpdir

        # Lambda for sorting by name and number
        sort_key = lambda item: tuple((int(num) if num else alpha)
                                      for (num, alpha) in re.findall(r"(\d+)|(\D+)", item))

        # Can give an input directory as optional argument
        # If given:
        #     read the alignment files
        #     optionally calculate pairwise distances
        #     store the sequence data
        if input_dir:
            files = self.get_files(input_dir, file_format)
            # file checks
            if files == 0:
                print "!!!"
                print "There was a problem reading files from {0}".format(input_dir)
                print "!!!"
                sys.exit()
            if get_distances and not os.path.isfile(helper):
                print "!!!"
                print "There was a problem finding the darwin helper at {0}".format(helper)
                print "!!!"
                sys.exit()

            # done
            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None, file_format=file_format, datatype=datatype)
            # takes care of self.length for us
            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)

        elif records:
            # Can optionally give record objects directly if no input dir specified
            self.put_records(files=None, record_list=records, file_format=file_format, datatype=datatype)
            # takes care of self.length for us
            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances
        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir, overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir, overwrite=overwrite)

    def __str__(self):
        s = "SequenceCollection object:\n"
        s += "Contains {0} alignments\n".format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format="fasta"):
        """
        Get list of alignment files from an input directory
        *.fa, *.fas and *.phy files only
        Stores in self.files
        """
        if file_format == "fasta":
            files = glob.glob("{0}/*.fa".format(input_dir))
            if len(files) == 0:
                files = glob.glob("{0}/*.fas".format(input_dir))
        elif file_format == "phylip":
            files = glob.glob("{0}/*.phy".format(input_dir))
        else:
            print "Unrecognised file format %s" % file_format
            files = None
        if not files:
            print "No sequence files found in {0}".format(input_dir)
            return 0
        return sorted(files)

    def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """
        directorycheck_and_make(output_dir)
        hash_translation = {}
        if not records:
            records = self.get_records()
        for rec in records:
            filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation, open("{0}/hash_translation.pkl".format(output_dir), "w"))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):
        if not filename.endswith(".gz"):
            filename += ".gz"
        cPickle.dump(self, file=gz.open(filename, "wb"), protocol=-1)

    @classmethod
    def gunzip(cls, filename):
        return cPickle.load(gz.open(filename, "rb"))

    def put_records(self, files=None, record_list=None, file_format="fasta", datatype="protein"):
        """
        Reads sequence files from the list generated by get_files
        and stores in self.records
        """
        get_name = lambda i: i[i.rindex("/") + 1:i.rindex(".")]
        if files and not record_list:
            record_list = [TCSeqRec(f, file_format=file_format, name=get_name(f), datatype=datatype)
                           for f in files]
        elif not files and not record_list:
            print "Can't load records - no records or alignment files given"
            return
        records_to_keys = dict([(record.name, number) for (number, record) in enumerate(record_list)])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(self, input_dir, records=None, use_hashname=False, program="phyml"):
        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = "{0}/{1}.phy_phyml_tree.txt".format(input_dir, name)
            stats_file = "{0}/{1}.phy_phyml_stats.txt".format(input_dir, name)
            try:
                rec.tree.load_phyml_results(tree_file, stats_file, name=rec.name, program=program)
            except FileError:
                failures.append(rec.name)
        if failures:
            print "Couldn't load results for the following records:"
            for f in failures:
                print " ", f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """
        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):
        for rec in self.get_records():
            rec.dv = [rec.get_dv_matrix(tmpdir=tmpdir, helper=helper, overwrite=overwrite)]

    def put_trees(self, rec_list=None, program="treecollection", model=None, datatype=None,
                  ncat=4, optimise="n", tmpdir=None, overwrite=True, verbose=False):
        if tmpdir is None:
            tmpdir = self.tmpdir
        if not program in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == "treecollection":
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "raxml":
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "phyml":
                tree = rec.get_phyml_tree(model=model, datatype=datatype, tmpdir=tmpdir,
                                          ncat=ncat, overwrite=overwrite, verbose=verbose)
            elif program == "bionj":
                tree = rec.get_bionj_tree(model=model, datatype=datatype, tmpdir=tmpdir,
                                          ncat=ncat, optimise=optimise, overwrite=overwrite,
                                          verbose=verbose)
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
        """
        Pass this function a list of metrics
        valid kwargs - invert (bool), normalise (bool)
        """
        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(self, metric, cluster_method, nclusters, prune=True, tmpdir=None, recalculate=False):
        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = self.Clustering.run_clustering(
            self.distance_matrices[metric], cluster_method, nclusters,
            prune=prune, recalculate=recalculate)
        self.clusters_to_partitions[(metric, cluster_method, nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
        """
        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(self, metrics, cluster_methods, nclusters, prune=True, tmpdir=None, recalculate=False):
        """
        metrics, linkages and nclasses are given as lists,
        or coerced into lists
        """
        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        else:
            nclusters = sorted(nclusters, reverse=True)
        # names = [rec.name for rec in self.get_records()]
        for metric in metrics:
            print "Clustering {0} data".format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print " ", cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(metric, cluster_method, n, prune=prune,
                                           tmpdir=tmpdir, recalculate=recalculate)

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if not concat[0].name in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(self, metric, prune=True, KMeans=True, recalculate=True, tmpdir=None,
                 max_groups=None, min_groups=2, check_single=True):
        """
        Uses Perona and Zelnick-Manor's spectral rotation method to
        determine the number of clusters present in the data
        """
        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]
        if check_single and min_groups > 1:
            print "Checking for single cluster..."
            (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
                dm, prune=prune, KMeans=KMeans, recalculate=recalculate,
                max_groups=6, min_groups=1, verbose=False)
            if nclusters == 1:
                print "Single cluster found."
                print "Quality Scores: {0}".format(quality_scores)
                self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
                self.partitions[partition_vector] = Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print ">1 clusters found."
                print "Quality Scores: {0}".format(quality_scores)
                recalculate = False
        (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
            dm, prune=prune, KMeans=KMeans, recalculate=recalculate,
            max_groups=max_groups, min_groups=min_groups)
        self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(self, program="treecollection", model=None, datatype=None, ncat=4,
                          optimise="n", tmpdir="/tmp", overwrite=True, max_guide_trees=True):
        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if program == "treecollection":
            return self._put_best_TC_trees(tmpdir=tmpdir, overwrite=overwrite,
                                           max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        self.put_trees(rec_list=rec_list, program=program, model=model, ncat=ncat,
                       optimise=optimise, datatype=datatype, tmpdir=tmpdir, overwrite=overwrite)
        self.update_scores()

    def _put_best_TC_trees(self, tmpdir="/tmp", overwrite=True, max_guide_trees=-1):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print "Calculating treecollection tree for {0}".format(rec.name),
            if rec.name in self.inferred_trees and overwrite == False:
                print "Skipping - already calculated (overwrite set to False)"
                continue
            guidetrees = [self.keys_to_records[member].tree for member in members]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = "{0}/{1}".format(tmpdir, pref)
            dv_file = pref + "_dv.txt"
            labels_file = pref + "_labels.txt"
            map_file = pref + "_map.txt"
            if len(guidetrees) > 1:
                print "(using best of {0} guidetrees)".format(len(guidetrees))
            else:
                print "(using single guidetree)"
            for t in guidetrees:
                guidetree_file = "{0}/{1}.nwk".format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, "w") as writer:
                    writer.write(n)
                TCtrees.append(Tree.new_treecollection_tree(dv_file, map_file, labels_file,
                                                            guidetree_file, rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    @staticmethod
    def _pivot(lst):
        new_lst = zip(*lst)
        return ["".join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(self, tmpdir=None, get_distances=False, parallel_load=False, overwrite=True):
        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):
        for (compound_key, partition) in self.get_partitions():
            print " ".join(str(x) for x in compound_key)
            print partition
            print self.Clustering.get_memberships(partition)

    def simulate_from_record(self, record, output_dir, name, tmpdir, datatype=None,
                             allow_nonsense=False, split_lengths=None, gene_names=None):
        if not datatype:
            datatype = self.datatype
        if datatype == "protein":
            SeqSim.simulate_from_record_WAG(record, output_dir, name, tmpdir,
                                            allow_nonsense, split_lengths, gene_names)
        elif datatype == "dna":
            SeqSim.simulate_from_record_GTR(record, output_dir, name, tmpdir,
                                            allow_nonsense, split_lengths, gene_names)
        else:
            print "datatype {0} is not recognised".format(datatype)

    def simulate_from_result(self, key, output_dir, name, tmpdir, datatype=None, allow_nonsense=False):
        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            updated_record = self.concats[c.name][0]  # bug: records in Partition
                                                      # objects aren't linked
                                                      # to trees
            members = c.name.split("-")
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = ["sim" + self.keys_to_records[int(x)].name for x in members]
            self.simulate_from_record(updated_record, output_dir, name=name, tmpdir=tmpdir,
                                      allow_nonsense=allow_nonsense, split_lengths=lengths,
                                      gene_names=names)

    #######################
    # Getters
    #######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """
        sort_key = lambda item: tuple((int(num) if num else alpha)
                                      for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name))
        return [rec for (rec, _) in sorted(self.concats.values(), key=sort_key)]

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis
        """
        sort_key = lambda item: tuple((int(num) if num else alpha)
                                      for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name))
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v]) for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [(k, self.partitions[v].get_membership(flatten=flatten))
                for (k, v) in self.clusters_to_partitions.items()]

    def get_scores(self):
        return [(k, self.partitions[v].score) for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs

    def get_records(self):
        """
        Returns list of stored sequence records
        """
        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """
        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """
        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

    #########################
    # Plotters
    #########################

    def plot_dendrogram(self, metric, link, nclasses, show=True):
        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object
    def plot_heatmap(self, distance_matrix, partition, outfile=None):
        sort_partition = partition.get_membership(flatten=True)
        fig = distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig("{0}.pdf".format(outfile))
        return fig

    def plot_embedding(self, partition_vector, distance_matrix, embedding="MDS", prune=True,
                       dimensions=3, centre_of_mass=False, outfile=None, standardize=False,
                       normalise=False, annotate=False):
        """
        Plots an embedding of the trees in a Principal Coordinate
        space, and saves as pdf.
        """
        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == "MDS":
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3,
                                                                  normalise=normalise)
        elif embedding == "spectral":
            laplacian = self.Clustering.spectral(dm, prune=prune)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(laplacian, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3,
                                                                  normalise=normalise)
        else:
            print "embedding should be one of 'MDS' or 'spectral'"
            print "value given was:", embedding
            return
        min_Z = min([z for (x, y, z) in coords])
        # get the indices of the partition vector for each group
        # and store in this list
        P = []
        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)
        colors = "bgrcmyk"
        coldict = {"b": "blue", "g": "green", "r": "red", "c": "cyan",
                   "m": "magenta", "y": "yellow", "k": "black"}
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection="3d")
        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot([coords[i][0], coords[i][0]],
                          [coords[i][1], coords[i][1]],
                          [min_Z, coords[i][2]],
                          color="grey", linewidth=0.2)
                if annotate:
                    ax2d.annotate(labels[i], xy=(coords[i][0], coords[i][1]), xytext=(-20, 20),
                                  textcoords="offset points", fontsize="x-small",
                                  ha="right", va="bottom",
                                  bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
                                  arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"))
            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color="k", marker="x", s=2, *com[:2])
                ax3d.scatter(color="k", marker="x", s=2, *com)
        if embedding == "spectral" and normalise:
            (u, v) = np.mgrid[0:2 * np.pi:20j, 0:np.pi:10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color="grey", linewidth=0.2)
        ax2d.set_xlabel("PCo1")
        ax2d.set_ylabel("PCo2")
        ax2d.set_title("Trees embedded in dimension-reduced space")
        ax3d.set_xlabel("PCo1")
        ax3d.set_ylabel("PCo2")
        ax3d.set_zlabel("PCo3")
        ax3d.set_title("Trees embedded in dimension-reduced space")
        if outfile:
            fig2d.savefig("{0}-2d.pdf".format(outfile))
            fig3d.savefig("{0}-3d.pdf".format(outfile))
        return (fig2d, fig3d)

    #########################
    # Parallelisers
    #########################

    def _unpack_dv(self, packed_args):
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):
        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + "/" + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, tmpdir + "/" + rec.name, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (w, x, y, z) in args:
            if os.path.isdir(x):
                os.rmdir(x)
        results = results[0]
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results))

    def put_dv_matrices_parallel(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):
        dv_matrices_dict = self._dv_parallel_call(tmpdir, helper, overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=1,
                             tmpdir="/tmp", overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=4,
                             tmpdir="/tmp", overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):
        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(self, rec_list=None, program="treecollection", model=None, datatype=None,
                           ncat=4, tmpdir="/tmp", overwrite=True):
        if not program in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == "treecollection":
            trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            trees_dict = self._phyml_parallel_call(rec_list=rec_list, model=model, datatype=datatype,
                                                   tmpdir=tmpdir, ncat=ncat, overwrite=overwrite)
        elif program == "bionj":
            trees_dict = self._bionj_parallel_call(rec_list=rec_list, model=model, datatype=datatype,
                                                   tmpdir=tmpdir, ncat=ncat, overwrite=overwrite)
        for rec in self.get_records():
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]

    def put_cluster_trees_parallel(self, program="treecollection", model=None, datatype=None,
                                   ncat=4, tmpdir="/tmp", overwrite=True):
        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        if program == "treecollection":
            cluster_trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir,
                                                        overwrite=overwrite)
        elif program == "raxml":
            cluster_trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir,
                                                           overwrite=overwrite)
        elif program == "phyml":
            cluster_trees_dict = self._phyml_parallel_call(rec_list=rec_list, model=model,
                                                           datatype=datatype, ncat=ncat,
                                                           tmpdir=tmpdir, overwrite=overwrite)
        elif program == "bionj":
            cluster_trees_dict = self._bionj_parallel_call(rec_list=rec_list, model=model,
                                                           datatype=datatype, ncat=ncat,
                                                           tmpdir=tmpdir, overwrite=overwrite)
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()

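# Usage sketch for SequenceCollection (Python 2, to match the class above).
# The directory, metric name and cluster method below are illustrative
# guesses, not values confirmed by this code:
#
#   sc = SequenceCollection(input_dir="alignments", file_format="fasta",
#                           datatype="protein", get_distances=False)
#   sc.put_trees(program="bionj")
#   sc.put_distance_matrices(["euc"])
#   sc.put_partitions(["euc"], ["spectral"], [2, 3])
#   sc.concatenate_records()
#   sc.put_cluster_trees(program="bionj")
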
def print_result_file(cluster, filename):  # signature inferred from the print_result_file(clusterD, fileOUT) call below
    file = open(filename, "w")
    for k, v in cluster.iteritems():
        for photo in v:
            file.write("%d\t%d\n" % (photo, k))
    file.close()


print "Loading json into memory..."
dictionary = readjson("/vol/corpora4/mediaeval/2014/SED_2014_Dev_Metadata.json")
print "...Done !"

clusterU = clusterUser(dictionary, fileID)
clusterD = clusterDate(dictionary, fileID, clusterU)
print_result_file(clusterD, fileOUT)

reference = Clustering.load(fileREF)
hypothesis = Clustering.load(fileOUT)

images = []
for c in clusterD.values():
    for i in range(0, len(c)):
        images.append(c[i])

h = homogeneity(reference, hypothesis, images)
print h
c = completeness(reference, hypothesis, images)
print c
