def _make_dataset(self, path, size) -> RepertoireDataset:
    """Create a random repertoire dataset and implant three signals into it.

    A random dataset of ``size`` repertoires is generated under ``path``;
    the signals "disease", "HLA" and "age" are then implanted through the
    five implanting groups "i1" to "i5", and the dataset returned by
    ``SignalImplanter.run`` is handed back to the caller.
    """
    base_dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=size,
        sequence_count_probabilities={100: 1.},
        sequence_length_probabilities={5: 1.},
        labels={},
        path=path,
    )

    # (signal identifier, motif identifier, motif seed)
    signal_specs = (
        ("disease", "m1", "AAA"),
        ("HLA", "m2", "CCC"),
        ("age", "m3", "GGG"),
    )
    signals = [
        Signal(
            identifier=signal_id,
            motifs=[Motif(identifier=motif_id, instantiation=GappedKmerInstantiation(), seed=seed)],
            implanting_strategy=HealthySequenceImplanting(
                implanting_computation=ImplantingComputation.ROUND,
                implanting=GappedMotifImplanting(),
            ),
        )
        for signal_id, motif_id, seed in signal_specs
    ]
    disease, hla, age = signals

    # (implanting name, dataset_implanting_rate, signals of that implanting);
    # "i1" receives the full signal list object itself, as before.
    implanting_specs = (
        ('i1', 0.2, signals),
        ('i2', 0.2, [disease, hla]),
        ('i3', 0.1, [disease]),
        ('i4', 0.2, [age]),
        ('i5', 0.1, [hla]),
    )
    simulation = Simulation([
        Implanting(
            dataset_implanting_rate=rate,
            signals=group_signals,
            name=group_name,
            repertoire_implanting_rate=0.25,
        )
        for group_name, rate, group_signals in implanting_specs
    ])

    state = SimulationState(
        signals=signals,
        dataset=base_dataset,
        formats=['Pickle'],
        result_path=path,
        name='my_synthetic_dataset',
        simulation=simulation,
    )
    return SignalImplanter.run(state)
def test_run_with_receptors(self):
    """Implant a two-chain motif signal into a random receptor dataset.

    With 10 receptors and a dataset implanting rate of 0.5, the test
    expects the example count to stay at 10 and exactly 5 receptors to
    carry ``metadata["signal1"] is True``. The working directory is
    removed once the assertions have passed.
    """
    path = PathBuilder.build(EnvironmentSettings.root_path / "test/tmp/signalImplanter_receptor/")

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {10: 1}, {12: 1}, {}, path / "dataset/")

    motif1 = Motif(
        identifier="motif1",
        instantiation=GappedKmerInstantiation(),
        seed_chain1="AAA",
        name_chain1=Chain.ALPHA,
        seed_chain2="CCC",
        name_chain2=Chain.BETA,
    )
    signal1 = Signal(
        identifier="signal1",
        motifs=[motif1],
        implanting_strategy=ReceptorImplanting(GappedMotifImplanting()),
    )
    simulation = Simulation([Implanting(dataset_implanting_rate=0.5, signals=[signal1])])
    sim_state = SimulationState(
        dataset=dataset,
        result_path=path,
        simulation=simulation,
        signals=[signal1],
        formats=["ImmuneML"],
    )

    new_dataset = SignalImplanter.run(sim_state)

    self.assertEqual(10, new_dataset.get_example_count())

    # Count the receptors flagged as carrying the implanted signal.
    implanted_count = 0
    for receptor in new_dataset.get_data(40):
        if receptor.metadata["signal1"] is True:
            implanted_count += 1
    self.assertEqual(5, implanted_count)

    shutil.rmtree(path)
def _parse_simulation(key: str, simulation: dict, symbol_table: SymbolTable) -> SymbolTable:
    """Parse one simulation specification and register it in the symbol table.

    Args:
        key: name under which the resulting ``Simulation`` is added to
            ``symbol_table``.
        simulation: mapping from implanting name to that implanting's
            settings dict. The settings keys are checked against
            ``valid_implanting_keys``; the "signals" entry lists signal
            keys that are resolved through ``symbol_table``.
        symbol_table: table holding the already parsed signals; the new
            simulation is added to it.

    Returns:
        The ``symbol_table`` that was passed in, with the simulation added.

    Raises:
        AssertionError: if the dataset implanting rates of all implantings
            sum to more than 1. Key validation is delegated to
            ``ParameterValidator.assert_keys``.
    """
    location = "SimulationParser"
    valid_implanting_keys = ["dataset_implanting_rate", "repertoire_implanting_rate", "signals", "is_noise"]

    implantings = []
    for impl_key, implanting in simulation.items():
        ParameterValidator.assert_keys(implanting.keys(), valid_implanting_keys, location, impl_key, exclusive=False)
        ParameterValidator.assert_keys(implanting["signals"], symbol_table.get_keys_by_type(SymbolType.SIGNAL),
                                       location, impl_key, False)

        # Work on a copy so the caller's specification is left untouched when
        # signal keys are swapped for the signal objects and the name is added.
        implanting_params = copy.deepcopy(implanting)
        implanting_params["signals"] = [symbol_table.get(signal) for signal in implanting["signals"]]
        implanting_params["name"] = impl_key

        implantings.append(Implanting(**implanting_params))

    total_rate = sum(settings["dataset_implanting_rate"] for settings in simulation.values())
    # Raised explicitly instead of via an `assert` statement so the check is
    # not stripped when Python runs with -O. `not (... <= 1)` keeps the
    # original outcome for values that do not compare (e.g. NaN).
    if not total_rate <= 1:
        raise AssertionError("The total dataset implanting rate can not exceed 1.")

    symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))

    return symbol_table
def test_run(self):
    """Implant two signals into a 10-repertoire dataset and check the labels.

    Ten repertoires are built from the same four sequences. Implanting
    "i1" carries signals s1 and s2, and "i2" carries s2 only, each with a
    dataset implanting rate of 0.2. The test therefore expects every
    repertoire to have a metadata entry for both signals, 4 repertoires
    with s2 set to True and 2 with s1 set to True. It also checks that
    every repertoire's data file name is among the dataset's file names.
    The working directory is removed once the assertions have passed.
    """
    path = EnvironmentSettings.tmp_test_path / "signalImplanter/"
    os.makedirs(path, exist_ok=True)

    sequences = [ReceptorSequence("ACDEFG", identifier=str(i)) for i in range(1, 5)]
    repertoires = [
        Repertoire.build_from_sequence_objects(sequence_objects=sequences, path=path, metadata={})
        for _ in range(10)
    ]
    dataset = RepertoireDataset(repertoires=repertoires)

    m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
    m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")

    s1 = Signal(
        identifier="s1",
        motifs=[m1],
        implanting_strategy=HealthySequenceImplanting(
            GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND),
    )
    s2 = Signal(
        identifier="s2",
        motifs=[m1, m2],
        implanting_strategy=HealthySequenceImplanting(
            GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND),
    )

    simulation = Simulation([
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s1, s2], name="i1"),
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s2], name="i2"),
    ])

    input_params = SimulationState(dataset=dataset, result_path=path, simulation=simulation,
                                   signals=[s1, s2], formats=["ImmuneML"])

    new_dataset = SignalImplanter.run(input_params)

    # Load the resulting repertoires once and reuse them for every check.
    implanted_reps = list(new_dataset.get_data(batch_size=10))

    self.assertEqual(10, len(new_dataset.get_example_ids()))
    self.assertTrue(all(s1.id in rep.metadata.keys() for rep in implanted_reps))
    self.assertTrue(all(s2.id in rep.metadata.keys() for rep in implanted_reps))

    # assertEqual (rather than assertTrue(a == b)) reports the actual count on failure.
    reps_with_s2 = sum(rep.metadata[s2.id] is True for rep in implanted_reps)
    reps_with_s1 = sum(rep.metadata[s1.id] is True for rep in implanted_reps)
    self.assertEqual(4, reps_with_s2)
    self.assertEqual(2, reps_with_s1)

    metadata_filenames = {filename.name for filename in new_dataset.get_filenames()}
    self.assertTrue(all(repertoire.data_filename.name in metadata_filenames
                        for repertoire in new_dataset.repertoires))

    shutil.rmtree(path)