def test_write1(self):
    # Writing a dataset that was NOT merged is expected to emit every record
    # of the input batches, repeated ids included (seq1 shows up twice).
    # NOTE(review): the expected text is reproduced exactly as it appears in
    # this view, where whitespace looks collapsed; presumably every FASTA
    # header and sequence sits on its own line - confirm against the fixture.
    expected_text = """>seq1 aaaA >seq2 aaAA >seq3 B >seq1 aaaA >seq4 aAAa >seq5 bBBbb >seq6 bbbBB >seq7 bbBBb >seq8 cccCC >seq9 ccCCc >seq10 Dd >seq11 dD """

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
    dataset.write(WRITE_DATASET_RES_PATH)

    # Read back what was just written and compare it verbatim.
    with open(WRITE_DATASET_RES_PATH) as written_file:
        written_text = written_file.read()

    self.assertEqual(expected_text, written_text)
def test_count_verified_regions(self):
    # The batch fixtures are expected to hold 12 verified regions in total.
    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
    self.assertEqual(12, dataset.count_verified_regions())
def load_data(self, fasta_paths_lst: List[str]):
    """
    Builds the internal epitopes dataset from the given batch files

    Parameters
    ----------
    fasta_paths_lst : List[str]
        Paths of the input epitopes batch files (fasta format)
    """
    loaded_dataset = EpitopesDataset(fasta_paths_lst)
    # Identical sequences are merged before the dataset is stored, so the
    # stored dataset is the merged one.
    loaded_dataset.merge_identical_seqs()
    self.__epitopes_dataset = loaded_dataset
def test_merge_identical_seqs(self):
    # After merging, one epitope per distinct sequence is expected, carrying
    # the verified regions listed below (none for 'B').
    def build_epitope(sequence, verified_regions=None):
        epitope = Epitope(SeqRecord(Seq(sequence)))
        if verified_regions is None:
            return epitope
        return add_verified_regions_lst(epitope, verified_regions)

    expected_epitopes = [
        build_epitope('aaaA', [(2, 3), (3, 3), (1, 2)]),
        build_epitope('B'),
        build_epitope('bBBbb', [(3, 4), (2, 3)]),
        build_epitope('cccCC', [(2, 3)]),
        build_epitope('Dd', [(1, 1)]),
    ]

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
    dataset.merge_identical_seqs()

    self.assertEqual(expected_epitopes, list(dataset))
def get_epitopes_with_max_verified_regions(
        epitopes_clusters: EpitopesClusters) -> EpitopesDataset:
    """
    Picks, from every cluster, the epitope record that has the most
    verified regions and collects the picks into a dataset

    Parameters
    ----------
    epitopes_clusters : EpitopesClusters
        Clusters of epitope records to pick from

    Returns
    -------
    max_verified_regions_epitopes_dataset : model.EpitopesDataset.EpitopesDataset
        Dataset holding one epitope record per cluster; on a tie the record
        that comes first in the cluster is kept
    """
    representatives = []
    for members in epitopes_clusters:
        # Start from the first member; the strict comparison below means a
        # later member only wins when it has strictly more verified regions.
        best_epitope = members[0]
        best_count = len(best_epitope.verified_regions)
        for candidate in members:
            candidate_count = len(candidate.verified_regions)
            if candidate_count <= best_count:
                continue
            best_epitope, best_count = candidate, candidate_count
        representatives.append(best_epitope)
    return EpitopesDataset(representatives)
def __init__(self, docker_client: DockerClient, temp_output_dir_path: str,
             cd_hit_img_id: str):
    """
    Stores the given collaborators, starts with an empty epitopes dataset
    and makes sure the temporary output directory exists

    Parameters
    ----------
    docker_client : DockerClient
        Docker client kept for later use
    temp_output_dir_path : str
        Directory for temporary output; created here if it is missing
    cd_hit_img_id : str
        Presumably the id of the CD-HIT docker image - confirm with callers
    """
    self.__docker_client = docker_client
    self.__cd_hit_img_id = cd_hit_img_id
    self.__temp_output_dir = temp_output_dir_path
    # Empty until data is loaded.
    self.__epitopes_dataset: EpitopesDataset = EpitopesDataset([])
    # exist_ok=True: an already existing directory is not an error.
    makedirs(self.__temp_output_dir, exist_ok=True)
def test_equal(self):
    # The first dataset repeats 'aa' and lists its records in a different
    # order than the second one; the two are still expected to compare equal.
    first_sequences = ['a', 'A', 'aa', 'aa']
    second_sequences = ['aa', 'A', 'a']

    first_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(sequence))) for sequence in first_sequences])
    second_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(sequence))) for sequence in second_sequences])

    self.assertEqual(first_dataset, second_dataset)
def test_get_epitopes_with_max_verified_regions(self):
    # One representative epitope is expected per cluster of the fixture.
    expected_sequences = ['AaAA', 'bBBBB', 'DDdDdD']
    expected_dataset = EpitopesDataset(
        [Epitope(SeqRecord(Seq(sequence))) for sequence in expected_sequences])

    clusters = EpitopesClusters(EPITOPES_CLUSTERS1_PATH, EPITOPES_FASTA1_PATH)
    actual_dataset = get_epitopes_with_max_verified_regions(clusters)

    self.assertEqual(expected_dataset, actual_dataset)
def split_epitopes_clusters_to_cv_datasets(
        epitopes_clusters: EpitopesClusters,
        cv_fold: int,
        shuffle_clusters=True) -> List[EpitopesDataset]:
    """
    Distributes whole clusters of epitope records over CV datasets

    Clusters are appended to the current dataset until it reaches the
    target size, so a cluster is never split between two datasets and the
    number of resulting datasets depends on the cluster sizes (it is not
    forced to equal cv_fold).

    Parameters
    ----------
    epitopes_clusters : EpitopesClusters
        Clusters of epitope records to distribute
    cv_fold : int
        Divisor used to derive the target size of each dataset
    shuffle_clusters : bool
        Whether to shuffle the clusters before distributing them

    Returns
    -------
    epitopes_cv_datasets : List[model.EpitopesDataset.EpitopesDataset]
        CV epitope records datasets
    """
    # Ceiling division written as negated floor division.
    target_group_size = -(-epitopes_clusters.get_num_of_epitopes() // cv_fold)

    clusters = epitopes_clusters
    if shuffle_clusters:
        # Shuffle a list copy; random.shuffle works in place.
        clusters = list(epitopes_clusters)
        random.shuffle(clusters)

    groups = [[]]
    for cluster in clusters:
        groups[-1].extend(cluster)
        # Open a new group as soon as the current one reached the target.
        if len(groups[-1]) >= target_group_size:
            groups.append([])

    # A trailing empty group is left over when the last cluster closed its
    # group (or when there were no clusters at all); drop it.
    if not groups[-1]:
        groups.pop()

    return [EpitopesDataset(group) for group in groups]
def test_init_epitopes_dataset(self):
    # A freshly built dataset is expected to list every record of the batch
    # fixtures in this order, repeated sequences included ('aaaA' twice).
    expected_sequences = [
        'aaaA', 'aaAA', 'B', 'aaaA', 'aAAa', 'bBBbb',
        'bbbBB', 'bbBBb', 'cccCC', 'ccCCc', 'Dd', 'dD',
    ]
    expected_epitopes = [
        Epitope(SeqRecord(Seq(sequence))) for sequence in expected_sequences
    ]

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)

    self.assertEqual(expected_epitopes, list(dataset))
def test_write2(self):
    # Writing a dataset AFTER merging identical sequences is expected to
    # emit the five records below.
    # NOTE(review): the expected text is reproduced exactly as it appears in
    # this view, where whitespace looks collapsed; presumably every FASTA
    # header and sequence sits on its own line - confirm against the fixture.
    expected_text = """>seq1 aAAA >seq3 B >seq5 bBBBB >seq8 ccCCC >seq10 DD """

    dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
    dataset.merge_identical_seqs()
    dataset.write(WRITE_DATASET_RES_PATH)

    # Read back what was just written and compare it verbatim.
    with open(WRITE_DATASET_RES_PATH) as written_file:
        written_text = written_file.read()

    self.assertEqual(expected_text, written_text)
def test_split_epitopes_clusters_to_cv_groups_cv10(self):
    # Shuffling is disabled so the split is deterministic. With cv_fold=10
    # the fixture is expected to yield the eight datasets listed below.
    cv_fold = 10
    expected_sequences_per_dataset = [
        ['aaaAA', 'Aaa', 'bbbBC', 'cccaaCd', 'DcDcDc', 'AAAAA'],
        ['aaaG', 'GCAcGcGa', 'aCGPfpc', 'cccccCCCccc', 'GgG', 'DDDDD'],
        ['EEeeeGGGDDD', 'BBBbbb', 'NMnMnM', 'KPkgK', 'AAAaaA', 'AAAaaaA'],
        ['BBBbbBbB', 'CCCccC', 'GGGggGG', 'CcCcCc', 'cCcccc', 'ccCccC'],
        ['CccCC', 'cccCCcC', 'CccCCCccC', 'CcccCCcccc', 'cccCCCccC',
         'cccCccccc'],
        ['aaAAAaaAA', 'BBBBbbBB', 'bbbBBBBB', 'GGGGgg', 'GGGG'],
        ['TTTtTTT', 'HHHHHHhhh', 'HHHhhhhKK', 'kkkkKKKk', 'UUUuuuU',
         'GFgGgF', 'CCCcCBBb', 'mmmmmmMMMmm', 'BBbb'],
        ['GGGg', 'AAaa', 'AAAa'],
    ]
    expected_datasets = [
        EpitopesDataset(
            [Epitope(SeqRecord(Seq(sequence))) for sequence in sequences])
        for sequences in expected_sequences_per_dataset
    ]

    clusters = EpitopesClusters(EPITOPES_CLUSTERS2_PATH, EPITOPES_FASTA2_PATH)
    actual_datasets = split_epitopes_clusters_to_cv_datasets(
        clusters, cv_fold, shuffle_clusters=False)

    self.assertEqual(expected_datasets, actual_datasets)