def _create_dummy_data(self, path, dataset_type):
        """Create a tiny five-sequence dummy dataset stored under ``path``.

        A single repertoire with five sequences (two custom lists, cell ids
        1/1/1/2/2) is built at ``path``. Depending on ``dataset_type``:

        * ``"receptor"``: the repertoire's ``receptors`` are pickled to
          ``{path}/receptors.pkl`` and a ReceptorDataset pointing at that
          file is returned;
        * ``"repertoire"``: the repertoire gets the identifier
          ``"repertoire_dataset"`` and is wrapped in a RepertoireDataset.

        Any other ``dataset_type`` yields ``None``.
        """
        PathBuilder.build(path)

        n_sequences = 5
        dummy_repertoire = Repertoire.build(
            sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
            v_genes=["V1-1"] * n_sequences,
            j_genes=["J1-1"] * n_sequences,
            chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
            custom_lists={
                "custom_1": [f"CUST-{index}" for index in range(n_sequences)],
                "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            pickle_path = f"{path}/receptors.pkl"
            with open(pickle_path, "wb") as handle:
                pickle.dump(dummy_repertoire.receptors, handle)
            return ReceptorDataset(filenames=[pickle_path],
                                   identifier="receptor_dataset")

        if dataset_type == "repertoire":
            dummy_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[dummy_repertoire])

        # Unknown dataset type: no dataset is created.
        return None
Exemplo n.º 2
0
    def test_implant_in_repertoire(self):
        """Check full-sequence implanting into a three-sequence repertoire.

        A signal with the motif seed "AAAA" is implanted with rate 0.33 into
        a repertoire of three "CCCC" sequences. The resulting repertoire must
        keep its size, contain exactly one "AAAA" sequence and two untouched
        "CCCC" sequences.
        """
        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}full_seq_implanting/")

        # try/finally guarantees the temporary directory is removed even when
        # an assertion (or the implanting itself) fails, so a failing run
        # cannot leave stale files behind for the next one.
        try:
            signal = Signal("sig1", [Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")], FullSequenceImplanting())

            repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path)

            new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path)

            self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences))
            self.assertEqual(1, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "AAAA"]))
            self.assertEqual(2, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "CCCC"]))
        finally:
            shutil.rmtree(path)
Exemplo n.º 3
0
    def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams):
        """Load one repertoire file listed in the metadata and build a Repertoire.

        The file ``params.path + metadata_row['filename']`` is read through
        ``ImportHelper.load_sequence_dataframe`` (using the import class'
        ``alternative_load_func`` attribute if it has one) and then passed to
        ``import_class.preprocess_dataframe``. Columns whose names appear in
        ``Repertoire.FIELDS`` are handed to ``Repertoire.build`` as keyword
        arguments; all remaining columns are passed as ``custom_lists``.

        Returns:
            The repertoire built under ``params.result_path + "repertoires/"``.

        Raises:
            RuntimeError: if any step fails; the original exception is chained.
        """
        try:
            loader_override = getattr(import_class, "alternative_load_func", None)
            source_file = f"{params.path}{metadata_row['filename']}"

            table = ImportHelper.load_sequence_dataframe(source_file, params, loader_override)
            table = import_class.preprocess_dataframe(table, params)

            # Columns matching known repertoire fields become direct keyword arguments.
            sequence_lists = {}
            for field in Repertoire.FIELDS:
                if field in table.columns:
                    sequence_lists[field] = table[field].values.tolist()

            # Everything else is carried along as custom lists.
            extra_columns = set(table.columns) - set(Repertoire.FIELDS)
            sequence_lists["custom_lists"] = {column: table[column].values.tolist() for column in extra_columns}

            build_kwargs = {
                "metadata": metadata_row.to_dict(),
                "path": params.result_path + "repertoires/",
                **sequence_lists,
            }
            return Repertoire.build(**build_kwargs)
        except Exception as exception:
            raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
Exemplo n.º 4
0
    def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
        """Collapse duplicate rows of a repertoire and build a new Repertoire.

        The repertoire data is loaded into a DataFrame, grouped by the fields
        returned from ``DuplicateSequenceFilter._prepare_group_by_field`` and
        aggregated with the dict from ``DuplicateSequenceFilter._prepare_agg_dict``.
        Columns not listed in ``Repertoire.FIELDS`` are treated as custom lists.

        Args:
            repertoire: repertoire whose data and metadata are read.
            params: settings forwarded to the helper functions; must contain
                ``"result_path"``, which is used as the path of the new repertoire.

        Returns:
            A new Repertoire built from the aggregated rows, with a deep copy
            of the original metadata.
        """
        frame = pd.DataFrame(repertoire.load_data())

        # Grouping keys, custom columns and aggregation rules are all derived
        # from the columns as loaded, before the "chains" column is touched.
        group_keys = DuplicateSequenceFilter._prepare_group_by_field(params, frame.columns)
        custom_lists = list(set(frame.columns) - set(Repertoire.FIELDS))
        aggregations = DuplicateSequenceFilter._prepare_agg_dict(params, frame.columns, custom_lists)

        # Chain objects can not be aggregated, convert to strings
        if "chains" not in frame.columns:
            frame["chains"] = None
        else:
            frame["chains"] = [item.value if isinstance(item, Chain) else item
                               for item in frame["chains"]]

        deduplicated = frame.groupby(group_keys).agg(aggregations).reset_index()

        def column_or_none(name):
            # Values of an aggregated column as a list, or None when it is absent.
            if name in deduplicated.columns:
                return list(deduplicated[name])
            return None

        chain_keys = column_or_none("chains")
        chains = None if chain_keys is None else [Chain(key) for key in chain_keys]

        return Repertoire.build(
            sequence_aas=column_or_none("sequence_aas"),
            sequences=column_or_none("sequences"),
            v_genes=column_or_none("v_genes"),
            j_genes=column_or_none("j_genes"),
            chains=chains,
            counts=column_or_none("counts"),
            region_types=column_or_none("region_types"),
            custom_lists={name: list(deduplicated[name]) for name in custom_lists},
            sequence_identifiers=list(deduplicated["sequence_identifiers"]),
            metadata=copy.deepcopy(repertoire.metadata),
            path=params["result_path"])
Exemplo n.º 5
0
    def test_process(self):
        """Check DuplicateSequenceFilter on a seven-sequence repertoire.

        Two scenarios are run against the same dataset:

        * collapsing by amino acid sequence with summed counts must leave
          three sequences (identifiers 1, 3, 7; counts 35, 50, 40);
        * collapsing by nucleotide sequence with minimum counts must leave
          four sequences (identifiers 1, 2, 3, 7; counts 5, 20, 20, 40).
        """
        path = EnvironmentSettings.root_path + "test/tmp/duplicatesequencefilter/"
        PathBuilder.build(path)

        # try/finally guarantees the temporary directory is removed even when
        # an assertion fails, so leftovers cannot influence a later run.
        try:
            dataset = RepertoireDataset(repertoires=[
                Repertoire.build(
                    sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
                    sequences=[
                        "ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC",
                        "ntDDD"
                    ],
                    v_genes=["v1", "v1", "v1", "v1", "v1", "v1", "v1"],
                    j_genes=["j1", "j1", "j1", "j1", "j1", "j1", "j1"],
                    chains=[
                        Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA,
                        Chain.ALPHA, Chain.ALPHA, Chain.BETA
                    ],
                    counts=[10, 20, 30, 5, 20, None, 40],
                    region_types=[
                        "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3",
                        "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3"
                    ],
                    custom_lists={
                        "custom1": ["yes", "yes", "yes", "no", "no", "no", "no"],
                        "custom2": ["yes", "yes", "yes", "no", "no", "no", "no"]
                    },
                    sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
                    path=path)
            ])

            # collapse by amino acids & use sum counts
            dupfilter = DuplicateSequenceFilter(
                filter_sequence_type=SequenceType.AMINO_ACID,
                count_agg=CountAggregationFunction.SUM,
                batch_size=4)

            reduced_repertoire = dupfilter.process_dataset(
                dataset=dataset, result_path=path).repertoires[0]

            attr = reduced_repertoire.get_attributes([
                "sequence_identifiers", "sequence_aas", "sequences", "counts",
                "chains"
            ])

            self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
            self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
            self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"],
                                 list(attr["sequences"]))
            self.assertListEqual([35, 50, 40], list(attr["counts"]))
            self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
            self.assertListEqual(
                [Chain.get_chain("A"),
                 Chain.get_chain("A"),
                 Chain.get_chain('B')], list(attr["chains"]))

            # collapse by nucleotides & use min counts
            dupfilter = DuplicateSequenceFilter(
                filter_sequence_type=SequenceType.NUCLEOTIDE,
                count_agg=CountAggregationFunction.MIN,
                batch_size=4)

            reduced_repertoire = dupfilter.process_dataset(
                dataset=dataset, result_path=path).repertoires[0]

            attr = reduced_repertoire.get_attributes(
                ["sequence_identifiers", "sequence_aas", "sequences", "counts"])

            self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
            self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
            self.assertListEqual(["AAA", "AAA", "CCC", "CCC"],
                                 list(attr["sequence_aas"]))
            self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"],
                                 list(attr["sequences"]))
            self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))
        finally:
            shutil.rmtree(path)