Example #1
0
 def test_matrix(self):
     """Batch precluster/cluster a VDJdb beta repertoire and build the
     cluster matrix, then clean up the temporary batch files."""
     repo = datasets.vdjdb_beta()
     longest = repo.str.len().max()
     n_batches, batch_size = 3, 3000
     clustering = Clustering(faiss_training_data=repo.sample(2000),
                             fitting_data_size=n_batches * batch_size,
                             max_sequence_size=longest)
     # Precluster three independently drawn batches, each under its own name.
     for i in range(n_batches):
         clustering.batch_precluster(repo.sample(batch_size),
                                     name=f'time {i}')
     # Consume the generator; matrix computation is requested per batch.
     for clusters in clustering.batch_cluster(calc_cluster_matrix=True):
         df = clusters.clusters_df
     clustering.batch_cluster_matrix()
     clustering.batch_cleanup()
Example #2
0
 def test_batch_clustering_multiprocessing(self):
     """Batch preclustering and clustering with all CPUs enabled."""
     repo = datasets.vdjdb_beta()
     longest = repo.str.len().max()
     n_batches, batch_size = 3, 3000
     clustering = Clustering(faiss_training_data=repo.sample(2000),
                             fitting_data_size=n_batches * batch_size,
                             max_sequence_size=longest,
                             n_cpus='all')
     # Precluster three independently drawn, unnamed batches.
     for _ in range(n_batches):
         clustering.batch_precluster(repo.sample(batch_size))
     # Drain the generator so every batch is actually clustered.
     for clusters in clustering.batch_cluster():
         df = clusters.clusters_df
     clustering.batch_cleanup()
Example #3
0
def evaluate_distance_metrics(start,
                              end,
                              step_size,
                              replicates,
                              filename=None):
    """Benchmark Hamming vs. Levenshtein two-step clustering over sample sizes.

    For each sample size ``n`` in ``range(start, end, step_size)`` and each of
    ``replicates`` repeats, a random VDJdb beta sample is clustered once per
    distance metric; the fit is timed and the quality summary collected.

    Parameters
    ----------
    start, end, step_size : int
        Passed to ``range`` to produce the sample sizes to evaluate.
    replicates : int
        Number of random samples drawn per size.
    filename : str, optional
        If given, the accumulated results are written tab-separated to
        ``./results/<filename>`` after each size finishes (checkpointing).

    Returns
    -------
    pd.DataFrame
        One summary row-set per (size, replicate, metric), with columns
        'n', 'dm' and 't' added.
    """
    # Accumulate partial frames in a list: DataFrame.append was deprecated in
    # pandas 1.4 and removed in 2.0, and appending in a loop is quadratic.
    summaries = []
    for n in range(start, end, step_size):
        print('###################')
        print(n)
        print('###################')
        for i in range(replicates):

            try:
                beta = datasets.vdjdb_beta().sample(n)
            except ValueError:
                # n exceeds the dataset size; larger sizes will too.
                break

            # Restrict the epitope reference to the sampled sequences.
            epi = datasets.vdjdb_beta(epitopes=True)
            epi = epi[epi.CDR3.isin(beta)]

            t = time.time()
            out_HD = Clustering(method='two-step',
                                distance_metric='HAMMING').fit(beta)
            t_hd = time.time() - t

            t = time.time()
            out_LD = Clustering(method='two-step',
                                distance_metric='LEVENSHTEIN').fit(beta)
            t_ld = time.time() - t

            summ_HD = out_HD.metrics(epi).summary()
            summ_HD['n'] = n
            summ_HD['dm'] = 'Hamming'
            summ_HD['t'] = t_hd
            summ_LD = out_LD.metrics(epi).summary()
            summ_LD['n'] = n
            summ_LD['dm'] = 'Levenshtein'
            summ_LD['t'] = t_ld
            summaries.append(summ_HD)
            summaries.append(summ_LD)

        if filename is not None:
            checkpoint = pd.concat(summaries) if summaries else pd.DataFrame()
            checkpoint.to_csv(join('./results/', filename),
                              sep='\t', index=False)

    return pd.concat(summaries) if summaries else pd.DataFrame()
Example #4
0
class ClusteringTest(TestBase):
    """End-to-end tests for cluster feature generation and model training."""

    def setUp(self):
        """Load shared CDR3/epitope fixtures and fit a default clustering."""
        self.cdr3 = datasets.test_cdr3()
        self.epitopes = datasets.test_epitopes()
        self.clustering_result = Clustering().fit(self.cdr3)

    def make_features(self):
        """Return cluster features, including pgen computation."""
        return self.clustering_result.compute_features(compute_pgen=True)

    def test_feature_generation(self):
        """Feature computation completes without error."""
        self.make_features()

    def test_pca(self):
        """PCA runs over the computed cluster features."""
        features = self.make_features()
        ClusterAnalysis(features).pca()

    def test_prediction(self):
        """Quality prediction runs over the computed cluster features."""
        features = self.make_features()
        ClusterAnalysis(features).predict_quality()

    def test_train_model(self):
        """A quality model can be fitted, evaluated, and saved to disk."""
        model = ModelTraining(self.clustering_result.clusters_df,
                              self.epitopes)
        fitted = model.fit_data()
        model.evaluate()
        model.save(fitted, 'test.pkl')
Example #5
0
 def test_summary(self):
     """summary() runs on a freshly fitted default clustering."""
     result = Clustering().fit(self.cdr3)
     result.summary()
Example #6
0
 def test_metrics(self):
     """Every metric accessor runs against the epitope reference."""
     result = Clustering().fit(self.cdr3)
     metrics = result.metrics(self.epitopes)
     # Same call order as listing them one by one.
     for compute in (metrics.purity,
                     metrics.consistency,
                     metrics.retention,
                     metrics.purity_90,
                     metrics.summary):
         compute()
Example #7
0
 def test_faiss_cluster_size(self):
     """Every method fits across a spread of faiss cluster sizes."""
     # Equivalent to range(2, 6003, 2000).
     for cluster_size in (2, 2002, 4002, 6002):
         for method in ('two-step', 'faiss', 'mcl'):
             Clustering(method=method,
                        faiss_cluster_size=cluster_size).fit(self.cdr3)
Example #8
0
 def test_multiprocessing(self):
     """Every method accepts the full range of n_cpus settings."""
     for n_cpus in (-1, 0, 1, 2, 'all'):
         for method in ('two-step', 'faiss', 'mcl'):
             Clustering(method=method, n_cpus=n_cpus).fit(self.cdr3)
Example #9
0
 def test_alphabeta(self):
     """fit() accepts paired alpha/beta CDR3 chains."""
     paired = datasets.vdjdb_paired()
     alpha = paired['CDR3_alpha']
     Clustering().fit(paired['CDR3_beta'], alpha=alpha)
Example #10
0
 def test_faiss(self):
     """fit() runs with the pure-faiss clustering method."""
     faiss_clustering = Clustering(method='faiss')
     faiss_clustering.fit(self.cdr3)
Example #11
0
 def test_mcl(self):
     """fit() runs with the MCL clustering method."""
     mcl_clustering = Clustering(method='mcl')
     mcl_clustering.fit(self.cdr3)
Example #12
0
 def test_quality(self):
     """Clustering quality stays above known baseline thresholds."""
     metrics = Clustering().fit(self.cdr3).metrics(self.epitopes)
     # (observed, baseline) pairs, evaluated in the original order.
     checks = ((metrics.purity()[0], 0.6),
               (metrics.consistency()[0], 0.12),
               (metrics.retention(), 0.21),
               (metrics.purity_90()[0], 0.36))
     for observed, baseline in checks:
         self.assertGreater(observed, baseline)
Example #13
0
 def test_cluster_contents(self):
     """cluster_contents() runs on a fitted default clustering."""
     result = Clustering().fit(self.cdr3)
     result.cluster_contents()
Example #14
0
 def test_write_to_csv(self):
     """write_to_csv() runs on a fitted default clustering."""
     result = Clustering().fit(self.cdr3)
     result.write_to_csv()
Example #15
0
 def test_normal(self):
     """A default Clustering fits the CDR3 fixture without error."""
     default_clustering = Clustering()
     default_clustering.fit(self.cdr3)
Example #16
0
 def setUp(self):
     """Load the shared CDR3/epitope fixtures and fit a default clustering."""
     cdr3 = datasets.test_cdr3()
     self.cdr3 = cdr3
     self.epitopes = datasets.test_epitopes()
     self.clustering_result = Clustering().fit(cdr3)