Example #1
def cluster_data(data, ov_min=10, ov_min1=0.2):
    cl = Clustering(ov_min, ov_min1)
    cl.fill_clusters(data['x1'], data['x2'])

    c_labels = cl.clabels
    nhits = len(c_labels)
    n_clust = cl.ncl
    x1cl = cl.cluster_x1
    x2cl = cl.cluster_x2
    ycl = []
    pcentcl = []
    namecl = []
    detailcl = []
    x1tcl = []
    x2tcl = []
    dxtcl = []
    for i in range(n_clust):
        ycl.append(float(i + 1) / 2.)
        # Take the per-hit attributes from the first hit that belongs to cluster i.
        for j in range(nhits):
            if c_labels[j] == i:
                pcentcl.append(data['pcent'][j])
                namecl.append(data['name'][j])
                detailcl.append(data['detail'][j])
                x1tcl.append(data['x1t'][j])
                x2tcl.append(data['x2t'][j])
                dxtcl.append(data['dxt'][j])
                break

    new_data = dict(x1=x1cl, x2=x2cl, dx=[end-beg for beg,end in zip(x1cl,x2cl)],
                    xm=[(beg+end)/2 for beg,end in zip(x1cl,x2cl)],
                    x1t=x1tcl, x2t=x2tcl, dxt=dxtcl,
                    y=ycl, nhits=cl.nhits, name=namecl, pcent=pcentcl, detail=detailcl)

    return new_data, n_clust
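
A note on the expected input (not part of the original project): cluster_data assumes data is a dict of equal-length, parallel lists keyed by the fields the loop above indexes. A minimal hypothetical call:

# Hypothetical toy input; only the keys and the parallel-list layout are implied
# by the code above, the field semantics belong to the source project.
data = {
    'x1': [0.0, 0.1, 5.0], 'x2': [1.0, 1.1, 6.0],
    'x1t': [0.0, 0.1, 5.0], 'x2t': [1.0, 1.1, 6.0], 'dxt': [1.0, 1.0, 1.0],
    'pcent': [99.0, 97.5, 90.0], 'name': ['a', 'b', 'c'], 'detail': ['', '', ''],
}
new_data, n_clust = cluster_data(data)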
Example #2
    def generate(self, keys, url):
        json_work("other_files/work_file.json", "w", [])  # reset the work file

        print(f'Keys received: {len(keys)}')

        if len(keys) > 0:
            self.generate_pretmp(
                keys
            )  # generate pre-templates from the keys, with unique stemming
            print(f'Keys after removing duplicates: {len(self.work_file)}')
            time.sleep(2)
            if len(self.work_file) > 0:
                with ThreadPoolExecutor(5) as executor:
                    for _ in executor.map(self.template_generated,
                                          self.work_file):
                        pass
                work = json_work("other_files/work_file.json", "r")
                if len(work) > 0:
                    gen_data = sorted(work,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/work_file.json", "w", gen_data)
                    gen_data += json_work("other_files/main.json", "r")
                    gen_data = sorted(gen_data,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/main.json", "w", gen_data)
                    print(f"url {url} обработан")
                    clustering = Clustering(
                        json_work("other_files/work_file.json", "r"), url)
                    clustering.run()
            else:
                print("Перехожу к следующему url")
        return
Example #3
 def get_decomp(self, method='MDS', **kwargs):
     optioncheck(method, ['MDS', 'spectral'])
     cl = Clustering(self.dm)
     if method == 'MDS':
         return cl.MDS_decomp()
     if method == 'spectral':
         return cl.spectral_decomp(**kwargs)
Example #4
    def __init__(self):
        self.nwalkers = 32
        self.ndim = 7
        filename = "tutorial.h5"
        backend = emcee.backends.HDFBackend(filename)
        backend.reset(self.nwalkers, self.ndim)

        hod_params = {"M_min": 0, "galaxy_density": 0.00057, "boxsize": 1000,
                      "log_halo_mass_bins": np.arange(10, 15, 0.1),
                      "halo_histo": np.loadtxt("../data/halo_central_histo.dat")}
        halofile = "../../ELG_HOD_optimization/data/halo_M200b_0.54980_for_mock.dat"
        self.mockfactory = MockFactory(halofile,
                                       boxsize=1000,
                                       cvir_fac=1,
                                       hod_parameters=hod_params)

        # clustering calculator
        rbins = np.logspace(np.log10(0.1), np.log10(70), 21)
        self.cluster = Clustering(rbins)

        # read xi and wp from data, read cov matrix
        self.clustering_data = np.loadtxt("../data/clustering_data.dat")
        self.scaled_cov = np.loadtxt("../data/scaled_cov.dat")

        #with Pool(10) as pool:
        #	self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim, self.log_prob, backend=backend, pool = pool)
        #	self.run()

        self.sampler = emcee.EnsembleSampler(self.nwalkers,
                                             self.ndim,
                                             self.log_prob,
                                             backend=backend)
        self.run()
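
self.run() and self.log_prob are defined elsewhere in that project. Assuming the standard emcee workflow, run() would typically draw one starting point per walker and advance the chain with run_mcmc, roughly:

        # Hypothetical sketch of what run() presumably does; not the project's actual code.
        p0 = np.random.rand(self.nwalkers, self.ndim)   # one starting point per walker
        self.sampler.run_mcmc(p0, 5000, progress=True)  # standard emcee driver call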
Example #5
def perform_clustering(
        term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their
            embeddings.
    Returns:
        A dictionary mapping each cluster label to its cluster.
        Each cluster is a set of term-ids.
    """
    # Case: fewer than 5 terms to cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        clusters = {}
        for i, tid in enumerate(term_ids_to_embs):
            clusters[i] = {tid}
        return clusters

    # Case: 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = list(term_ids_to_embs.items())
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print('  Density:', results['density'])
    clusters = defaultdict(set)
    for i in range(len(term_ids_embs_items)):
        term_id = term_ids_embs_items[i][0]
        label = labels[i]
        clusters[label].add(term_id)
    return clusters
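
A hypothetical usage sketch (the embeddings below are made up): with fewer than 5 terms the function never touches Clustering and simply returns one singleton cluster per term.

tiny = {11: [0.1, 0.2], 12: [0.3, 0.1], 13: [0.9, 0.8], 14: [0.2, 0.7]}
print(perform_clustering(tiny))  # {0: {11}, 1: {12}, 2: {13}, 3: {14}}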
Example #6
def main(args):
    #-----------------------------------------------------#
    #             2D/3D Convolutional Autoencoder         #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)

        cae.prepare_data(args.sampler_type, args.max_patches, args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox, args.label_prob, args.load_data)
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                #
    #-----------------------------------------------------#
    """
    if args.program=='AutSeg':
        asg = AutomaticSegmentation(    model_name=args.model_name,
                                        patch_size=args.patch_size,
                                        patch_overlap=args.patch_overlap,
                                        input_dir=args.data_dir, 
                                        model_dir=args.model_dir   )
        asg.run()
        asg.run_postprocessing()

"""
    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(
                                   args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
Example #7
def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal
    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of pcs for CPV > 0.8 and CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except:
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)

        pca_model = get_pca_model(inputs)

        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(
            pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(),
            cluster_prediction,
            pcs_cpv,
            inputs.num_cluster,
            model_goal=model_goal)
        return variation(cluster_performances)
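
The cv computed in the fallback is a coefficient of variation over per-cluster performances. Assuming variation here is scipy.stats.variation (an assumption, it is not shown in the snippet), it is just the standard deviation divided by the mean:

import numpy as np
perf = np.array([0.72, 0.65, 0.80])  # hypothetical per-cluster scores
cv = perf.std() / perf.mean()        # equals scipy.stats.variation(perf)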
Example #8
def dump_clusters():

    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])

    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title']) for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', 'redis://localhost:6379/'))

    if args['-cluster'] == 'agg':
        if args['-prune'] == 'true' or args['-prune'] == 'True':
            utilities.redis_kmeans_clusters(cluster_obj, articles, True, int(args['-limit']), r_conn)
            print("redis dump complete")
        else:
            utilities.redis_kmeans_clusters(cluster_obj, articles, False, int(args['-limit']), r_conn)
            print("redis dump complete")
    else:
        #TODO dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
Example #9
 def init_run(self, run_params):
     if self.experiment_type == self.CLASSIFICATION:
         return MultiClassClassification(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             performance_function=self.performance_function,
             embeddings=self.node_embeddings,
             **run_params,
             node_labels=self.node_labels,
             node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.CLUSTERING:
         return Clustering(method_name=self.method_name,
                           dataset_name=self.dataset_name,
                           embeddings=self.node_embeddings,
                           **run_params,
                           node_labels=self.node_labels,
                           performance_function=self.performance_function,
                           node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.MULTI_LABEL_CLASSIFICATION:
         return MultiLabelClassification(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             node_labels=self.node_labels,
             **run_params,
             performance_function=self.performance_function,
             embeddings=self.node_embeddings,
             node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.LINK_PREDICTION:
         return LinkPrediction(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             node_embeddings=self.node_embeddings,
             **run_params,
             performance_function=self.performance_function,
             node2id_filepath=self.node2id_filepath)
Example #10
def main():
    #X = [[1, 1], [1, 2],[1,3], [4, 4],[4, 5], [5, 4], [5, 5], [10, 9], [10,10], [20,19], [20, 20]]
    X, Y = make_blobs(n_samples=5000,
                      centers=10,
                      cluster_std=0.60,
                      random_state=0)
    cluster = Clustering(X.tolist())
    cluster.buildTree(cluster.root)

    cluster.createLevelMatrix(cluster.root)
    cluster.createDistanceMatrix(numberOfCluster, numberOfLevels)

    query = [0, 0]

    start = timeit.default_timer()
    # Your statements here
    print("aug", aug_mmr(cluster, 0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for aug mmr: ', stop - start)

    start = timeit.default_timer()

    # Your statements here
    print("mmr", _mmr(0.5, query, X, 15))

    stop = timeit.default_timer()

    print('Time for mmr: ', stop - start)
Example #11
def full_realtime(precompute_fraction=.4,
                  nqueries=50000,
                  ndataunits=100000,
                  nmachines=50,
                  r=3,
                  np=.995,
                  min_q_len=6,
                  max_q_len=15,
                  ctype='fast',
                  gcpatype='better'):
    g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
    queries = []
    q = 0
    while q < nqueries:
        node = random.randint(0, ndataunits - 1)
        line = iterative_dfs(g, node, path=[])
        if len(line) >= min_q_len:
            queries.append(line)
            q += 1

    graphfile = 'n' + str(
        len(queries) / 1000) + 'np' + str(np) + ctype + gcpatype + 'test'
    with open(graphfile + '.csv', 'wb') as f:
        w = csv.writer(f)
        for line in queries:
            w.writerow(line)

    print 'Queries generated', len(queries)
    infile = graphfile
    max_to_process = min(nqueries, len(queries))
    queries = queries[:max_to_process]

    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed)

    rt_queries = queries[len(pre_computed):]

    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)

    gcpa_data.process(machines, dataunit_in_machine)

    rt_covers = []

    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print 'Query: ', idx
        cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                          machines, dataunit_in_machine, ctype)

        rt_covers.append(cover)

    return gcpa_data.covers, rt_covers
Example #12
def kga(data, k, random_state=None):
    rand.seed(random_state)
    problem = Clustering(data)
    centroids, _, _ = genetic(problem,
                              k,
                              t_pop=10,
                              taxa_cross=0.95,
                              taxa_mutacao=0.2)
    return centroids
Example #13
    def clusterize(self):
        print("\nclusterize")

        self.process_clustering_data()

        c = Clustering(self.clustering_df)
        c.k_means(2)
        c.k_means(3)
        c.k_means(4)
Example #14
def setup(source, pdf_path):
    ngrams = NGramSpace(4)
    print "parsing documents at %s..." % source
    docs = [
        extract_row(row, pdf_path, ngrams)
        for row in csv.DictReader(open(source, 'r'))
    ]
    print "clustering %d documents..." % len(docs)
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)
Example #15
 def cluster(self, shapelets):
     """
     Uses a clustering algorithm to reduce the number of shapelets.
     :param shapelets: list of shapelet candidates
     :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
     :return: list of remaining shapelet candidates
     :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
     """
     clustering = Clustering(self.d_max)
     clustering.fit(shapelets)
     return clustering.nn_centers()
Example #16
    def test_pairs(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
        self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.closest_pair([6, 7]))

        self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
        self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.farthest_pair([6, 7]))
Example #17
def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
  Do the clustering, based on the 91 features.
  Args:
	  x: array of features
	  df: dataframe of features
	  n_clusters: number of clusters
	  distance: could be 'angular' or 'euclidean';
      method: could be 'K-medians', 'K-means', 'Hierarchical'
  Output:
	  new_df: the labeled dataframe, according to the clustering algorithm
	  relevant_features_cs: a list with the relevant features (angles of the consecutive limbs) of the centroids
	  cs: dictionary with the centroid features 
  """

    relevant_features_id = [
        0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90
    ]
    keys_dict = [
        '0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6', '5-7',
        '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13'
    ]

    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)

    assert len(list(cls.keys())) == n_clusters

    d = pd.DataFrame()
    l = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', l)

    new_df = df.reset_index().merge(d).set_index('index')

    relevant_features_cs = []
    if method == 'Hierarchical':
        pass
    else:
        for i in range(len(cs)):
            d = {}
            cs_rf = cs[i][relevant_features_id]
            for k in range(len(keys_dict)):
                d[keys_dict[k]] = cs_rf[k]
            relevant_features_cs.append(d)

    return new_df, relevant_features_cs, cs
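
The label join near the end relies on pandas merging on the shared feature columns. A toy illustration of that pattern with a hypothetical 2-feature frame (not the 91-feature data above):

import pandas as pd

df_toy = pd.DataFrame({'f0': [1.0, 2.0, 3.0], 'f1': [0.1, 0.2, 0.3]})
labeled = pd.DataFrame({'f0': [3.0, 1.0, 2.0], 'f1': [0.3, 0.1, 0.2], 'label': [1, 0, 0]})
# merge() without 'on' joins on the common columns (f0, f1), so every original
# row picks up the label of its matching clustered row, keyed back via 'index'.
out = df_toy.reset_index().merge(labeled).set_index('index')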
Example #18
    def test_distance(self):
        raw_docs = ['a b c', 'b c d', 'd e f']
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual(0, c.distance[0, 0])
        self.assertEqual(0.5, c.distance[1, 0])
        self.assertEqual(0, c.distance[1, 1])
        self.assertEqual(1.0, c.distance[2, 0])
        self.assertEqual(0.8, c.distance[2, 1])
        self.assertEqual(0, c.distance[2, 2])
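
These expected values are consistent with Jaccard distance over unigram sets, 1 - |A ∩ B| / |A ∪ B| (an assumption about this Clustering's metric, but it reproduces every number in the test):

# 'a b c' vs 'b c d': overlap {b, c}, union of 4 tokens -> 1 - 2/4 = 0.5
# 'a b c' vs 'd e f': no overlap                        -> 1 - 0/6 = 1.0
# 'b c d' vs 'd e f': overlap {d}, union of 5 tokens    -> 1 - 1/5 = 0.8
def jaccard_distance(a, b):
    a, b = set(a.split()), set(b.split())
    return 1 - len(a & b) / len(a | b)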
Example #19
    def test_nonseeded_clustering(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        self.assertEqual((1, 0), c.min_link())
        c.merge(1, 0)
        self.assertEqual((2, 1), c.min_link())
        c.merge(2, 1)
        self.assertTrue(c.min_link() in [(4, 3), (5, 3)])
        c.merge(3, 4)
        c.merge(3, 5)
        self.assertEqual((7, 6), c.min_link())
Example #20
def cluster_data(data):
    etl = Etl()
    df = etl.process_data(data)
    df = etl.generate_rfm(df)
    df = etl.normalize_df(df)
    clustering = Clustering()
    [metrics, clusters] = clustering.generate_cluster(df)
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    try:
        requests.post(server_url + '/metrics', headers=headers, json=metrics)
        requests.post(server_url + '/clusters', headers=headers, json=clusters)
    except Exception as e:
        print('Error', e)
Example #21
    def calculate_ref_wk(self, method, k):
        self.wk_refs = []

        for ref in range(self.refs.shape[2]):
            ref_clustering = Clustering(self.refs[:, :, ref], k)
            model, document_topic, word_topic = getattr(
                ref_clustering, method)()
            clusters = ref_clustering.document_topic.argmax(axis=1)
            wk_ref = self.calculate_wk(self.refs[:, :, ref], clusters)
            log_wk_ref = np.log(wk_ref)
            self.wk_refs.append(log_wk_ref)

        return self.wk_refs
Example #22
def precompute_clustering(pre_computed, machines, dataunit_in_machine):

    clustering = Clustering(pre_computed)
    # Indexed with the clusters. This array will store the necessary G-part information for each of the clusters
    parts_data = []
    ctr = 0

    for cluster in clustering.clusters: 
        print '%d out of %d'  % (ctr, len(clustering.clusters))
        ctr += 1
        part_covers, dataunit_in_parts = gcpa_precompute_rt(cluster, machines, dataunit_in_machine)
        parts_data.append((part_covers, dataunit_in_parts))

    return clustering, parts_data
Example #23
    def test_nearest_neighbors(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        c.pp_distance(range(0, len(test_docs)))

        self.assertEqual([1], c.closest_neighbors([0], 1))
        self.assertEqual([1, 2], c.closest_neighbors([0], 2))
        self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
        self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))

        self.assertEqual([5], c.closest_neighbors([3, 4], 1))
        self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))
Example #24
def main(fn, clusters_no):
    geo_locs = []
    #read location data from csv file and store each location as a Point(latit,longit) object
    df = pd.read_csv(fn)
    for index, row in df.iterrows():
        loc_ = Point(float(row['LAT']), float(row['LON']))  # Point object for this location
        geo_locs.append(loc_)
    #run k_means clustering
    cluster = Clustering(geo_locs, clusters_no)
    flag = cluster.k_means(False)
    if flag == -1:
        print("Error in arguments!")
    else:
        #clustering results is a list of lists where each list represents one cluster
        print("Clustering results:")
        cluster.print_clusters(cluster.clusters)
Example #25
    def test_clustering(self):
        raw_docs = ['a b c', 'b c d', 'd e f']
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual((1, 0), c.min_link())

        c.merge(1, 0)
        self.assertEqual([1, 1, 2], c.assignments)

        self.assertEqual((2, 1), c.min_link())

        c.merge(2, 0)
        self.assertEqual([2, 2, 2], c.assignments)
Example #26
def printClusters(reduced_data, algo="kmean"):
    # Plot the data with matplotlib
    clust = Clustering(reduced_data, 5)
    if (algo == "ga"):
        clust.GA(10)
    else:
        clust.kMeans()

    centroids, clusterAssment = clust.centroids, clust.clusterAssment

    cluster1X = []
    cluster1Y = []
    cluster2X = []
    cluster2Y = []
    cluster3X = []
    cluster3Y = []
    cluster4X = []
    cluster4Y = []
    cluster5X = []
    cluster5Y = []

    for i in range(len(reduced_data)):

        if (clusterAssment[i][0, 0] == 0):
            cluster1X.append(reduced_data[i, 0])
            cluster1Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 1):
            cluster2X.append(reduced_data[i, 0])
            cluster2Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 2):
            cluster3X.append(reduced_data[i, 0])
            cluster3Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 3):
            cluster4X.append(reduced_data[i, 0])
            cluster4Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 4):
            cluster5X.append(reduced_data[i, 0])
            cluster5Y.append(reduced_data[i, 1])

    plot(cluster1X, cluster1Y, 'sg')
    plot(cluster2X, cluster2Y, 'ob')
    plot(cluster3X, cluster3Y, 'or')
    plot(cluster4X, cluster4Y, 'mo')
    plot(cluster5X, cluster5Y, 'ys')

    show()
Example #27
def filter_repeated_hits(data):
    cl = Clustering(10, 0.1) # cluster with default values
    cl.fill_clusters(data['x1'], data['x2'])

    toDelete = [False]*len(cl.clabels)
    for icl in range(cl.ncl):  # loop over clusters
        if cl.nhits[icl] > 50:
            counter = 0
            for i in range(len(cl.clabels)):  # loop over hits
                if cl.clabels[i] == icl:
                    counter += 1
                    if counter > 50:
                        toDelete[i] = True

    for key in data.keys():
        data[key][:] = [value for value,flag in zip(data[key],toDelete) if not flag]
    return data
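
The slice assignment data[key][:] = [...] filters every parallel list in place, so callers holding a reference to the same lists see the change too. A minimal standalone illustration:

data = {'x1': [1, 2, 3], 'x2': [10, 20, 30]}
to_delete = [False, True, False]
for key in data:
    data[key][:] = [v for v, flag in zip(data[key], to_delete) if not flag]
# data == {'x1': [1, 3], 'x2': [10, 30]}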
Example #28
File: dec.py  Project: arseninaanna/dec
def get_model(encoder, x):
    clusters = 10
    learning_rate = 0.01
    momentum = 0.9

    output = encoder.predict(x)
    centroids, prediction = cl.get_centroids(output, clusters)
    print("DEC: initial centroids found")

    clustering_layer = Clustering(clusters,
                                  weights=centroids,
                                  prediction=prediction,
                                  name='clustering')
    model = Sequential([encoder, clustering_layer])
    # model.compile(loss='kullback_leibler_divergence', optimizer='adadelta')
    model.compile(loss=cl.calculate_kl,
                  optimizer=SGD(lr=learning_rate, momentum=momentum))

    return model
Example #29
def start():
    # Set up logger
    logger = logging.getLogger('decoder')
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setLevel(logging.ERROR)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Set up components
    preprocessor = Preprocessor()
    clustering = Clustering(5)
    decoder = Decoder(logger)

    _input = Input()
    filename = _input.read_file('audio/testfile3.wav')
    # filename = _input.read_file('audio/testfile4.wav')

    preprocessor.read_csv(filename)
    # preprocessor.read_csv('simulation_2018-09-27_17-13-19.csv')
    preprocessor.plot()
    preprocessor.plot(True)

    preprocessor.process_loudness()
    preprocessor.plot()
    preprocessor.plot(True)

    training_batch = preprocessor.get_batch()
    labels = clustering.train(training_batch)
    mapping = clustering.get_label_mapping()
    signals = list()

    for label in labels:
        signals.append(mapping.get(label))

    for signal in signals:
        decoder.decode(signal)

    print(decoder.message)
Example #30
def clustering(x, n_clusters):
    """
  Do the clustering, based on the 91 features. 
  We compute the reconstructed poses only with the following default parameters:
    method: 'K-Medians'
    distance: 'angular'
  Args:
    x: array of features
    n_clusters: number of clusters
  Output:
    new_df: the labeled dataframe, according to the clustering algorithm
    relevant_features_cs: a list with the relevant features (angles of the consecutive limbs) of the centroids
    cs: dictionary with the centroid features 
  """

    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    d = pd.DataFrame()
    l = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', l)

    new_df = df.reset_index().merge(d).set_index('index')

    assert len(cs) == n_clusters

    relevant_features_cs = []
    for i in range(len(cs)):
        d = {}
        cs_rf = cs[i][relevant_features_id]
        for k in range(len(keys_dict)):
            d[keys_dict[k]] = cs_rf[k]
        relevant_features_cs.append(d)

    return new_df, relevant_features_cs, cs