def _make_dataset(self, path, size) -> RepertoireDataset:
        """Create a random repertoire dataset and implant three signals into it.

        A random dataset with ``size`` repertoires is generated under ``path``
        (sequence count 100 and sequence length 5, each with probability 1).
        Three signals ("disease", "HLA", "age"), each with a single gapped
        k-mer motif, are then implanted according to five implanting settings
        ('i1' to 'i5').

        Args:
            path: location passed both to the random dataset generator and to
                the simulation state as the result path.
            size: number of repertoires to generate.

        Returns:
            The value returned by ``SignalImplanter.run`` for the simulation.
        """
        base_dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=size,
            sequence_count_probabilities={100: 1.},
            sequence_length_probabilities={5: 1.},
            labels={},
            path=path)

        # One row per signal: (signal identifier, motif identifier, motif seed).
        signal_specs = (("disease", "m1", "AAA"),
                        ("HLA", "m2", "CCC"),
                        ("age", "m3", "GGG"))
        signals = [
            Signal(identifier=signal_id,
                   motifs=[Motif(identifier=motif_id, instantiation=GappedKmerInstantiation(), seed=motif_seed)],
                   implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                 implanting=GappedMotifImplanting()))
            for signal_id, motif_id, motif_seed in signal_specs
        ]
        disease, hla, age = signals

        # One row per implanting: (name, dataset implanting rate, signals).
        # 'i1' receives the full `signals` list object itself, not a copy.
        implanting_specs = (('i1', 0.2, signals),
                            ('i2', 0.2, [disease, hla]),
                            ('i3', 0.1, [disease]),
                            ('i4', 0.2, [age]),
                            ('i5', 0.1, [hla]))
        implantings = [
            Implanting(dataset_implanting_rate=rate, signals=implanted_signals, name=name,
                       repertoire_implanting_rate=0.25)
            for name, rate, implanted_signals in implanting_specs
        ]

        state = SimulationState(signals=signals, dataset=base_dataset, formats=['Pickle'], result_path=path,
                                name='my_synthetic_dataset', simulation=Simulation(implantings))

        return SignalImplanter.run(state)
Пример #2
0
    def test_run_with_receptors(self):
        """Implant a two-chain signal into a receptor dataset and check the labels.

        A random receptor dataset with 10 examples is generated, one signal
        with an alpha/beta motif is implanted at a dataset implanting rate of
        0.5, and the test expects 5 of the 10 receptors to carry
        ``metadata["signal1"] is True``.
        """
        path = PathBuilder.build(EnvironmentSettings.root_path /
                                 "test/tmp/signalImplanter_receptor/")

        # Clean up the temporary directory even when a step or assertion fails,
        # so that leftovers do not leak into later test runs.
        try:
            dataset = RandomDatasetGenerator.generate_receptor_dataset(
                10, {10: 1}, {12: 1}, {}, path / "dataset/")
            motif1 = Motif(identifier="motif1",
                           instantiation=GappedKmerInstantiation(),
                           seed_chain1="AAA",
                           name_chain1=Chain.ALPHA,
                           seed_chain2="CCC",
                           name_chain2=Chain.BETA)
            signal1 = Signal(identifier="signal1",
                             motifs=[motif1],
                             implanting_strategy=ReceptorImplanting(
                                 GappedMotifImplanting()))

            simulation = Simulation(
                [Implanting(dataset_implanting_rate=0.5, signals=[signal1])])

            sim_state = SimulationState(dataset=dataset,
                                        result_path=path,
                                        simulation=simulation,
                                        signals=[signal1],
                                        formats=["ImmuneML"])

            new_dataset = SignalImplanter.run(sim_state)

            self.assertEqual(10, new_dataset.get_example_count())

            receptors_with_signal = [
                receptor for receptor in new_dataset.get_data(40)
                if receptor.metadata["signal1"] is True
            ]
            self.assertEqual(5, len(receptors_with_signal))
        finally:
            shutil.rmtree(path)
Пример #3
0
    def _parse_simulation(key: str, simulation: dict, symbol_table: SymbolTable) -> SymbolTable:
        """Build a ``Simulation`` from its specification and add it to the symbol table.

        Args:
            key: name under which the simulation is registered in ``symbol_table``.
            simulation: mapping from implanting name to that implanting's
                settings dict. Each settings dict is checked with
                ``ParameterValidator.assert_keys`` against the allowed keys
                ("dataset_implanting_rate", "repertoire_implanting_rate",
                "signals", "is_noise"); its "signals" entries are checked
                against the signal keys known to ``symbol_table``.
            symbol_table: table used to resolve signal names and to store the
                resulting simulation.

        Returns:
            The ``symbol_table`` passed in, after the simulation was added.

        Raises:
            AssertionError: if the dataset implanting rates of all implantings
                add up to more than 1.
        """
        import math  # local import: only needed for the rate-sum check below

        location = "SimulationParser"
        valid_implanting_keys = ["dataset_implanting_rate", "repertoire_implanting_rate", "signals", "is_noise"]
        implantings = []

        for impl_key, implanting in simulation.items():

            ParameterValidator.assert_keys(implanting.keys(), valid_implanting_keys, location, impl_key, exclusive=False)
            ParameterValidator.assert_keys(implanting["signals"], symbol_table.get_keys_by_type(SymbolType.SIGNAL), location, impl_key, False)

            # Work on a copy so the caller's specification is left untouched
            # when signal names are replaced by the objects from the symbol table.
            implanting_params = copy.deepcopy(implanting)
            implanting_params["signals"] = [symbol_table.get(signal) for signal in implanting["signals"]]
            implanting_params["name"] = impl_key

            implantings.append(Implanting(**implanting_params))

        # Naive float summation can overshoot for rates that add up to exactly 1
        # in decimal (0.1 + 0.2 + 0.7 == 1.0000000000000002), so sum accurately
        # and tolerate rounding noise around the limit.
        total_rate = math.fsum(settings["dataset_implanting_rate"] for settings in simulation.values())
        if total_rate > 1 and not math.isclose(total_rate, 1):
            # Raised explicitly (instead of using an `assert` statement) so the
            # check is not stripped under `python -O`; the exception type is unchanged.
            raise AssertionError("The total dataset implanting rate can not exceed 1.")

        symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))

        return symbol_table
Пример #4
0
    def test_run(self):
        """Implant two signals into a repertoire dataset and check the resulting labels.

        Ten repertoires are built from the same four sequences. Two implanting
        settings, each with a dataset implanting rate of 0.2, are applied: 'i1'
        with signals s1 and s2, 'i2' with signal s2 only. The test expects all
        repertoires to have both signal ids in their metadata, 4 repertoires
        labelled with s2, 2 labelled with s1, and every repertoire data file to
        be listed among the dataset's filenames.
        """
        path = EnvironmentSettings.tmp_test_path / "signalImplanter/"
        os.makedirs(path, exist_ok=True)

        # Clean up the temporary directory even when a step or assertion fails,
        # so that leftovers do not leak into later test runs.
        try:
            sequences = [
                ReceptorSequence("ACDEFG", identifier="1"),
                ReceptorSequence("ACDEFG", identifier="2"),
                ReceptorSequence("ACDEFG", identifier="3"),
                ReceptorSequence("ACDEFG", identifier="4")
            ]

            repertoires = [
                Repertoire.build_from_sequence_objects(
                    sequence_objects=sequences, path=path, metadata={})
                for _ in range(10)
            ]

            dataset = RepertoireDataset(repertoires=repertoires)

            m1 = Motif(identifier="m1",
                       instantiation=GappedKmerInstantiation(),
                       seed="CAS")
            m2 = Motif(identifier="m2",
                       instantiation=GappedKmerInstantiation(),
                       seed="CCC")
            s1 = Signal(identifier="s1",
                        motifs=[m1],
                        implanting_strategy=HealthySequenceImplanting(
                            GappedMotifImplanting(),
                            implanting_computation=ImplantingComputation.ROUND))
            s2 = Signal(identifier="s2",
                        motifs=[m1, m2],
                        implanting_strategy=HealthySequenceImplanting(
                            GappedMotifImplanting(),
                            implanting_computation=ImplantingComputation.ROUND))

            simulation = Simulation([
                Implanting(dataset_implanting_rate=0.2,
                           repertoire_implanting_rate=0.5,
                           signals=[s1, s2],
                           name="i1"),
                Implanting(dataset_implanting_rate=0.2,
                           repertoire_implanting_rate=0.5,
                           signals=[s2],
                           name="i2")
            ])

            input_params = SimulationState(dataset=dataset,
                                           result_path=path,
                                           simulation=simulation,
                                           signals=[s1, s2],
                                           formats=["ImmuneML"])

            new_dataset = SignalImplanter.run(input_params)

            # Read the implanted repertoires once and reuse them for all checks.
            implanted_reps = list(new_dataset.get_data(batch_size=10))

            self.assertEqual(10, len(new_dataset.get_example_ids()))

            # Check that the labels exist before counting them, so a missing
            # label shows up as a failed assertion rather than a KeyError.
            for signal in (s1, s2):
                self.assertTrue(
                    all(signal.id in rep.metadata for rep in implanted_reps))

            # Presumably 0.2 * 10 = 2 repertoires per implanting: s2 is part of
            # both 'i1' and 'i2' (4 in total), s1 only of 'i1' (2 in total).
            reps_with_s2 = sum(rep.metadata[s2.id] is True for rep in implanted_reps)
            reps_with_s1 = sum(rep.metadata[s1.id] is True for rep in implanted_reps)
            self.assertEqual(4, reps_with_s2)
            self.assertEqual(2, reps_with_s1)

            metadata_filenames = {
                filename.name for filename in new_dataset.get_filenames()
            }
            self.assertTrue(
                all(repertoire.data_filename.name in metadata_filenames
                    for repertoire in new_dataset.repertoires))
        finally:
            shutil.rmtree(path)