def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2":
                [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = path / "receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            dataset = ReceptorDataset(filenames=[receptordataset_filename],
                                      identifier="receptor_dataset")

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
示例#2
0
    def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
        data = pd.DataFrame(repertoire.load_data())

        groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
        custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
        agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

        # Chain objects can not be aggregated, convert to strings
        if "chains" in data.columns:
            data["chains"] = [chain.value if isinstance(chain, Chain) else chain for chain in data["chains"]]
        else:
            data["chains"] = None

        no_duplicates = data.groupby(groupby_fields).agg(agg_dict).reset_index()

        processed_repertoire = Repertoire.build(sequence_aas=list(no_duplicates["sequence_aas"]) if "sequence_aas" in no_duplicates.columns else None,
                                                sequences=list(no_duplicates["sequences"]) if "sequences" in no_duplicates.columns else None,
                                                v_genes=list(no_duplicates["v_genes"]) if "v_genes" in no_duplicates.columns else None,
                                                j_genes=list(no_duplicates["j_genes"]) if 'j_genes' in no_duplicates.columns else None,
                                                chains=[Chain(key) for key in list(no_duplicates["chains"])] if "chains" in no_duplicates.columns else None,
                                                counts=list(no_duplicates["counts"]) if "counts" in no_duplicates else None,
                                                region_types=list(no_duplicates["region_types"]) if "region_types" in no_duplicates else None,
                                                custom_lists={key: list(no_duplicates[key]) for key in custom_lists},
                                                sequence_identifiers=list(no_duplicates["sequence_identifiers"]),
                                                metadata=copy.deepcopy(repertoire.metadata),
                                                path=params["result_path"],
                                                filename_base=f"{repertoire.data_filename.stem}_filtered")

        return processed_repertoire
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2":
                [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]
            },
            cell_ids=["1", "1", "1", "2", '2'],
            path=path)

        if dataset_type == "receptor":

            dataset = ReceptorDataset.build_from_objects(
                test_repertoire.receptors, 100, path, name="receptor_dataset")
            dataset.identifier = 'receptor_dataset'

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
    def test_implant_in_repertoire(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "full_seq_implanting/")
        signal = Signal("sig1", [Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")], FullSequenceImplanting())

        repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path)

        new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path)

        self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences))
        self.assertEqual(1, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "AAAA"]))
        self.assertEqual(2, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "CCCC"]))

        shutil.rmtree(path)
示例#5
0
    def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams):
        try:
            alternative_load_func = getattr(import_class, "alternative_load_func", None)

            filename = params.path / f"{metadata_row['filename']}"

            dataframe = ImportHelper.load_sequence_dataframe(filename, params, alternative_load_func)
            dataframe = import_class.preprocess_dataframe(dataframe, params)
            sequence_lists = {field: dataframe[field].values.tolist() for field in Repertoire.FIELDS if field in dataframe.columns}
            sequence_lists["custom_lists"] = {field: dataframe[field].values.tolist()
                                              for field in list(set(dataframe.columns) - set(Repertoire.FIELDS))}

            repertoire_inputs = {**{"metadata": metadata_row.to_dict(),
                                    "path": params.result_path / "repertoires/",
                                    "filename_base": filename.stem}, **sequence_lists}
            repertoire = Repertoire.build(**repertoire_inputs)

            return repertoire
        except Exception as exception:
            raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/duplicatesequencefilter/"
        PathBuilder.build(path)

        dataset = RepertoireDataset(repertoires=[
            Repertoire.build(
                sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
                sequences=[
                    "ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC",
                    "ntDDD"
                ],
                v_genes=["v1", "v1", "v1", "v1", "v1", "v1", "v1"],
                j_genes=["j1", "j1", "j1", "j1", "j1", "j1", "j1"],
                chains=[
                    Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA,
                    Chain.ALPHA, Chain.ALPHA, Chain.BETA
                ],
                counts=[10, 20, 30, 5, 20, None, 40],
                region_types=[
                    "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3",
                    "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3"
                ],
                custom_lists={
                    "custom1": ["yes", "yes", "yes", "no", "no", "no", "no"],
                    "custom2": ["yes", "yes", "yes", "no", "no", "no", "no"]
                },
                sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
                path=path)
        ])

        # collapse by amino acids & use sum counts
        dupfilter = DuplicateSequenceFilter(
            filter_sequence_type=SequenceType.AMINO_ACID,
            count_agg=CountAggregationFunction.SUM,
            batch_size=1)

        reduced_repertoire = dupfilter.process_dataset(
            dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes([
            "sequence_identifiers", "sequence_aas", "sequences", "counts",
            "chains"
        ])

        self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"],
                             list(attr["sequences"]))
        self.assertListEqual([35, 50, 40], list(attr["counts"]))
        self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual(['ALPHA', 'ALPHA', 'BETA'], list(attr["chains"]))

        # collapse by nucleotides & use min counts
        dupfilter = DuplicateSequenceFilter(
            filter_sequence_type=SequenceType.NUCLEOTIDE,
            count_agg=CountAggregationFunction.MIN,
            batch_size=4)

        reduced_repertoire = dupfilter.process_dataset(
            dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes(
            ["sequence_identifiers", "sequence_aas", "sequences", "counts"])

        self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual(["AAA", "AAA", "CCC", "CCC"],
                             list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"],
                             list(attr["sequences"]))
        self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))

        shutil.rmtree(path)