def test_write1(self):
        """Write the dataset built from the batch files (no merging) and compare the written file with the expected FASTA text."""
        dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
        dataset.write(WRITE_DATASET_RES_PATH)

        # Read back exactly what write() produced
        with open(WRITE_DATASET_RES_PATH) as written_file:
            written_text = written_file.read()

        # The literal is flush-left on purpose: any indentation would become part of the expected text
        expected_text = """>seq1
aaaA
>seq2
aaAA
>seq3
B
>seq1
aaaA
>seq4
aAAa
>seq5
bBBbb
>seq6
bbbBB
>seq7
bbBBb
>seq8
cccCC
>seq9
ccCCc
>seq10
Dd
>seq11
dD
"""

        self.assertEqual(expected_text, written_text)
    def test_count_verified_regions(self):
        """Check that the dataset built from the batch files reports 12 verified regions."""
        dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)

        self.assertEqual(12, dataset.count_verified_regions())
示例#3
0
    def load_data(self, fasta_paths_lst: List[str]):
        """
        Builds the epitopes dataset from the input batch files and keeps it on the instance

        Identical sequences are merged before the dataset is stored, so the
        previously held dataset is replaced only after the merge has finished.

        Parameters
        ----------
        fasta_paths_lst : List[str]
            List of input epitopes batch files in fasta format
        """
        loaded_dataset = EpitopesDataset(fasta_paths_lst)
        loaded_dataset.merge_identical_seqs()

        self.__epitopes_dataset = loaded_dataset
    def test_merge_identical_seqs(self):
        """After merge_identical_seqs(), iterating the dataset must yield the expected epitopes with their verified regions."""
        merged_a = add_verified_regions_lst(
            Epitope(SeqRecord(Seq('aaaA'))), [(2, 3), (3, 3), (1, 2)])
        plain_b = Epitope(SeqRecord(Seq('B')))
        merged_b = add_verified_regions_lst(
            Epitope(SeqRecord(Seq('bBBbb'))), [(3, 4), (2, 3)])
        merged_c = add_verified_regions_lst(
            Epitope(SeqRecord(Seq('cccCC'))), [(2, 3)])
        merged_d = add_verified_regions_lst(
            Epitope(SeqRecord(Seq('Dd'))), [(1, 1)])
        expected = [merged_a, plain_b, merged_b, merged_c, merged_d]

        dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
        dataset.merge_identical_seqs()

        self.assertEqual(expected, list(dataset))
def get_epitopes_with_max_verified_regions(
        epitopes_clusters: EpitopesClusters) -> EpitopesDataset:
    """
    Gets a dataset of the epitope records with maximum verified regions from each cluster

    Exactly one epitope is taken from every cluster: the one whose
    ``verified_regions`` collection is the longest. When several epitopes of
    a cluster share the maximum, the first of them in iteration order is kept.

    Parameters
    ----------
    epitopes_clusters : EpitopesClusters
        Iterable of clusters, each cluster being an iterable of epitope
        records that expose a ``verified_regions`` attribute

    Returns
    -------
    max_verified_regions_epitopes_dataset : model.EpitopesDataset.EpitopesDataset
        Dataset of the epitope records with maximum verified regions from each cluster

    Raises
    ------
    ValueError
        If a cluster contains no epitopes
    """
    def verified_regions_count(epitope):
        return len(epitope.verified_regions)

    # max() returns the first maximal item it meets, so ties resolve to the
    # earliest epitope of the cluster
    remaining_epitopes = [
        max(cluster, key=verified_regions_count)
        for cluster in epitopes_clusters
    ]

    return EpitopesDataset(remaining_epitopes)
示例#6
0
    def __init__(self, docker_client: DockerClient, temp_output_dir_path: str, cd_hit_img_id: str):
        """
        Stores the given collaborators, starts with an empty epitopes dataset
        and makes sure the temporary output directory exists

        Parameters
        ----------
        docker_client : DockerClient
            Docker client kept on the instance
        temp_output_dir_path : str
            Path of the temporary output directory; it is created if missing
            and an already existing directory is not an error
        cd_hit_img_id : str
            Kept on the instance as the CD-HIT image id
            (presumably a Docker image identifier - confirm against callers)
        """
        self.__docker_client = docker_client
        self.__cd_hit_img_id = cd_hit_img_id
        self.__temp_output_dir = temp_output_dir_path
        # No data is loaded yet, so the instance starts with an empty dataset
        self.__epitopes_dataset: EpitopesDataset = EpitopesDataset([])

        makedirs(temp_output_dir_path, exist_ok=True)
    def test_equal(self):
        """Two datasets must compare equal although the records are listed in a different order and one record is repeated in the first dataset."""
        first_dataset = EpitopesDataset([
            Epitope(SeqRecord(Seq(sequence)))
            for sequence in ('a', 'A', 'aa', 'aa')
        ])
        second_dataset = EpitopesDataset([
            Epitope(SeqRecord(Seq(sequence)))
            for sequence in ('aa', 'A', 'a')
        ])

        self.assertEqual(first_dataset, second_dataset)
    def test_get_epitopes_with_max_verified_regions(self):
        """The epitopes selected from the first clusters fixture must match the expected dataset."""
        expected_dataset = EpitopesDataset([
            Epitope(SeqRecord(Seq(sequence)))
            for sequence in ('AaAA', 'bBBBB', 'DDdDdD')
        ])

        clusters = EpitopesClusters(EPITOPES_CLUSTERS1_PATH, EPITOPES_FASTA1_PATH)
        selected_dataset = get_epitopes_with_max_verified_regions(clusters)

        self.assertEqual(expected_dataset, selected_dataset)
def split_epitopes_clusters_to_cv_datasets(
        epitopes_clusters: EpitopesClusters,
        cv_fold: int,
        shuffle_clusters=True) -> List[EpitopesDataset]:
    """
    Splits the epitope records into CV datasets without breaking up clusters

    Clusters are consumed in order and added whole to the group currently
    being filled; once that group holds at least
    ceil(number of epitopes / cv_fold) records a new group is started.
    The number of returned datasets is not forced to equal ``cv_fold``.

    Parameters
    ----------
    epitopes_clusters : EpitopesClusters
        Clusters of epitope records to distribute
    cv_fold : int
        Number of folds used to derive the approximate size of each dataset
    shuffle_clusters : bool
        When true, the clusters are shuffled with ``random.shuffle`` before
        being distributed

    Returns
    -------
    epitopes_cv_datasets : List[model.EpitopesDataset.EpitopesDataset]
        CV epitope records datasets
    """
    # Ceiling division: floor quotient, plus one when something is left over
    quotient, remainder = divmod(epitopes_clusters.get_num_of_epitopes(),
                                 cv_fold)
    target_group_size = quotient + (1 if remainder else 0)

    clusters = epitopes_clusters
    if shuffle_clusters:
        clusters = list(epitopes_clusters)
        random.shuffle(clusters)

    finished_groups = []
    open_group = []
    for cluster in clusters:
        open_group.extend(cluster)
        if len(open_group) >= target_group_size:
            # The group reached its target size: close it and start a new one
            finished_groups.append(open_group)
            open_group = []

    # Keep the records left over after the last closed group
    if open_group:
        finished_groups.append(open_group)

    return [EpitopesDataset(group) for group in finished_groups]
    def test_init_epitopes_dataset(self):
        """Iterating a dataset built from the batch files must yield every record, repeated sequences included, in the expected order."""
        expected_sequences = (
            'aaaA', 'aaAA', 'B', 'aaaA', 'aAAa', 'bBBbb',
            'bbbBB', 'bbBBb', 'cccCC', 'ccCCc', 'Dd', 'dD',
        )
        expected = [
            Epitope(SeqRecord(Seq(sequence)))
            for sequence in expected_sequences
        ]

        dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)

        self.assertEqual(expected, list(dataset))
    def test_write2(self):
        expected_output_file_text = \
            """>seq1
aAAA
>seq3
B
>seq5
bBBBB
>seq8
ccCCC
>seq10
DD
"""

        epitopes_dataset = EpitopesDataset(EPITOPES_BATCHES_PATHS)
        epitopes_dataset.merge_identical_seqs()
        epitopes_dataset.write(WRITE_DATASET_RES_PATH)
        with open(WRITE_DATASET_RES_PATH) as output_file:
            actual_output_file_text = output_file.read()

        self.assertEqual(expected_output_file_text,
                         actual_output_file_text)
    def test_split_epitopes_clusters_to_cv_groups_cv10(self):
        """Split the second clusters fixture with cv_fold=10 and shuffling disabled; the result must be the eight expected datasets, in order."""
        # One inner tuple per expected CV dataset, sequences in expected order
        expected_sequences_per_dataset = (
            ('aaaAA', 'Aaa', 'bbbBC', 'cccaaCd', 'DcDcDc', 'AAAAA'),
            ('aaaG', 'GCAcGcGa', 'aCGPfpc', 'cccccCCCccc', 'GgG', 'DDDDD'),
            ('EEeeeGGGDDD', 'BBBbbb', 'NMnMnM', 'KPkgK', 'AAAaaA',
             'AAAaaaA'),
            ('BBBbbBbB', 'CCCccC', 'GGGggGG', 'CcCcCc', 'cCcccc', 'ccCccC'),
            ('CccCC', 'cccCCcC', 'CccCCCccC', 'CcccCCcccc', 'cccCCCccC',
             'cccCccccc'),
            ('aaAAAaaAA', 'BBBBbbBB', 'bbbBBBBB', 'GGGGgg', 'GGGG'),
            ('TTTtTTT', 'HHHHHHhhh', 'HHHhhhhKK', 'kkkkKKKk', 'UUUuuuU',
             'GFgGgF', 'CCCcCBBb', 'mmmmmmMMMmm', 'BBbb'),
            ('GGGg', 'AAaa', 'AAAa'),
        )
        expected_datasets = [
            EpitopesDataset([
                Epitope(SeqRecord(Seq(sequence))) for sequence in sequences
            ])
            for sequences in expected_sequences_per_dataset
        ]

        clusters = EpitopesClusters(EPITOPES_CLUSTERS2_PATH,
                                    EPITOPES_FASTA2_PATH)
        actual_datasets = split_epitopes_clusters_to_cv_datasets(
            clusters, 10, shuffle_clusters=False)

        self.assertEqual(expected_datasets, actual_datasets)