def create_dataset(self, path, dataset_size: int = 50):
    """
    Build a pickled SequenceDataset fixture whose sequences alternate between two classes.

    Sequences at even positions are "AAACCC" with custom param l1=1, sequences at odd positions are
    "ACACAC" with l1=2; the identifier of each sequence is its stringified position.

    :param path: directory prefix; it is concatenated directly with "sequences.pkl", so it has to
        include the trailing path separator
    :param dataset_size: number of sequences to generate
    :return: SequenceDataset with identifier "d1" that points at the single pickle file
    """
    sequences = [
        ReceptorSequence(amino_acid_sequence="AAACCC" if i % 2 == 0 else "ACACAC",
                         identifier=str(i),
                         metadata=SequenceMetadata(custom_params={"l1": 1 if i % 2 == 0 else 2}))
        for i in range(dataset_size)
    ]

    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # the label values are handed to the dataset through params; the LabelConfiguration object that
    # used to be built here was never read, so it has been dropped
    return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def test_encode_sequence(self):
    """Check that a sequence is encoded as its 7 overlapping 3-mers and that a sequence shorter than k gives None."""

    def make_params():
        # a fresh EncoderParams per call, exactly as the individual checks need it
        return EncoderParams(model={"k": 3}, label_config=LabelConfiguration(),
                             result_path="", pool_size=4)

    receptor_sequence = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
    result = KmerSequenceEncoder.encode_sequence(receptor_sequence, make_params())

    for expected_kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
        self.assertTrue(expected_kmer in result)
    self.assertEqual(7, len(result))

    # "AC" has only two letters, so no 3-mer can be produced
    too_short = ReceptorSequence(amino_acid_sequence="AC")
    self.assertEqual(KmerSequenceEncoder.encode_sequence(too_short, make_params()), None)
def create_dummy_sequencedataset(self, path):
    """
    Build a SequenceDataset fixture from one alpha-chain and two beta-chain sequences.

    The two beta-chain sequences ("1b" and "2b") differ only in their identifiers.

    :param path: prefix that is concatenated with "sequences" and passed on to SequenceDataset.build
    :return: the object returned by SequenceDataset.build for the three sequences
    """

    def make_beta(identifier):
        # a fresh metadata object per sequence, so the two beta chains do not share state
        beta_metadata = SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                         custom_params={"d_call": "TRBD1", "custom2": "cust1"})
        return ReceptorSequence(amino_acid_sequence="ATATAT", identifier=identifier, metadata=beta_metadata)

    alpha_metadata = SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                      custom_params={"d_call": "TRAD1", "custom1": "cust1"})
    alpha = ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a", metadata=alpha_metadata)

    sequences = [alpha, make_beta("1b"), make_beta("2b")]

    # NOTE(review): the positional 2 is presumably the number of sequences per file - confirm against SequenceDataset.build
    return SequenceDataset.build(sequences, 2, "{}sequences".format(path))
def create_IMGT_gapped_kmers_from_sequence(sequence: ReceptorSequence, k_left: int, max_gap: int, k_right: int = None, min_gap: int = 0):
    """
    Create gapped k-mers from a sequence and annotate each of them with an IMGT position.

    Every letter of the sequence is paired with its position (as generated by
    PositionHelper.gen_imgt_positions_from_length) before the gapped k-mers are built, so each k-mer
    consists of (letter, position) tuples and, possibly, non-tuple gap elements.

    :param sequence: ReceptorSequence to split
    :param k_left: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param max_gap: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param k_right: forwarded to KmerHelper.create_gapped_kmers_from_string
    :param min_gap: forwarded to KmerHelper.create_gapped_kmers_from_string
    :return: list of (k-mer string, position) tuples, or None if no k-mers were created
    """
    residues = sequence.get_sequence()
    positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
    residues_with_positions = list(zip(residues, positions))

    raw_kmers = KmerHelper.create_gapped_kmers_from_string(residues_with_positions, k_left=k_left, max_gap=max_gap,
                                                           k_right=k_right, min_gap=min_gap)
    if raw_kmers is None:
        return None

    def text_of(kmer):
        # tuples contribute their letter, anything else (the gap elements) is used as it is
        return ''.join(item[0] if isinstance(item, tuple) else item for item in kmer)

    def position_of(kmer):
        # gap elements carry no position: 1000 keeps them from winning the min, 0 from passing the 112 filter
        lowest = min(item[1] if isinstance(item, tuple) else 1000 for item in kmer)
        if int(lowest) != 112:
            return lowest
        # when the smallest position truncates to 112, the largest position that truncates to 112 is reported
        # NOTE(review): presumably because IMGT numbers the 112.x insertions in descending order - confirm
        candidates = (item[1] if isinstance(item, tuple) else 0 for item in kmer)
        return max(position for position in candidates if int(position) == 112)

    return [(text_of(kmer), position_of(kmer)) for kmer in raw_kmers]
def test_create_model(self):
    """
    Check that KmerPairModelCreator returns a Word2Vec model whose vocabulary has 400 entries
    (among them "CA") for k=2.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    try:
        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                           model_path=test_path + "model.model")

        # dedicated assertions report the offending value instead of "False is not true"
        self.assertIsInstance(model, Word2Vec)
        self.assertIn("CA", model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))
    finally:
        shutil.rmtree(test_path)
def test_implant_in_repertoire(self):
    """
    Implant the motif "CCC" into a two-sequence repertoire at rate 0.5 and check that at least one
    original sequence is still present and at least one sequence contains the motif.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
    PathBuilder.build(path)

    try:
        repertoire = Repertoire.build_from_sequence_objects(
            [ReceptorSequence(amino_acid_sequence="ACDFQ", identifier="1"),
             ReceptorSequence(amino_acid_sequence="TGCDF", identifier="2")],
            path=path, metadata={"subject_id": "1"})

        implanting = HealthySequenceImplanting(GappedMotifImplanting(),
                                               implanting_computation=ImplantingComputation.ROUND)
        signal = Signal("1", [Motif("m1", GappedKmerInstantiation(), "CCC")], implanting)

        repertoire2 = implanting.implant_in_repertoire(repertoire, 0.5, signal, path)

        new_sequences = [sequence.get_sequence() for sequence in repertoire2.sequences]
        self.assertTrue("ACDFQ" in new_sequences or "TGCDF" in new_sequences)
        self.assertTrue(any("CCC" in sequence for sequence in new_sequences))
    finally:
        shutil.rmtree(path)
def test_create_kmers_from_sequence(self):
    """Check that a 7-letter sequence gives its 5 overlapping 3-mers and a 2-letter sequence gives none."""
    full_sequence = ReceptorSequence(amino_acid_sequence="ABCDEFG")
    kmers = KmerHelper.create_kmers_from_sequence(full_sequence, 3)

    expected_kmers = ("ABC", "BCD", "CDE", "DEF", "EFG")
    self.assertTrue(all(kmer in kmers for kmer in expected_kmers))
    self.assertEqual(5, len(kmers))

    # shorter than k: an empty collection is expected here
    short_sequence = ReceptorSequence(amino_acid_sequence="AB")
    short_kmers = KmerHelper.create_kmers_from_sequence(short_sequence, 3)
    self.assertTrue(len(short_kmers) == 0)
def test_encode_sequence(self):
    """Check that IdentitySequenceEncoder returns ["AAA"] for frame types "OUT", "STOP" and "IN"."""

    def encode(frame_type):
        # every check uses its own sequence, encoder and params objects
        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type=frame_type))
        enc = IdentitySequenceEncoder()
        return enc.encode_sequence(sequence, EncoderParams(model={}, label_config=LabelConfiguration(),
                                                           result_path=""))

    self.assertEqual(encode("OUT"), ["AAA"])
    self.assertEqual(encode("STOP"), ["AAA"])
    self.assertEqual(["AAA"], encode("IN"))
def test_get_sequence(self):
    """Check that get_sequence returns the amino acid sequence once the sequence type is set to AMINO_ACID."""
    receptor_sequence = ReceptorSequence(amino_acid_sequence="CAS", nucleotide_sequence="TGTGCTTCC")

    EnvironmentSettings.set_sequence_type(SequenceType.AMINO_ACID)

    returned_sequence = receptor_sequence.get_sequence()
    self.assertEqual(returned_sequence, "CAS")
def test_implant(self):
    """
    Implant several motif instances into a sequence of ten "A"s with GappedMotifImplanting.

    Checked are the implanted text, that the length stays 10 (first three cases), that with the position
    weights {105: 0.8, 106: 0.2} the motif starts at index 0 or 1, and that "/" never shows up in the result.
    """
    strategy = GappedMotifImplanting()

    def implant(instance, weighted):
        # a fresh sequence, signal dict and weight dict per call
        sequence = ReceptorSequence(amino_acid_sequence="AAAAAAAAAA")
        signal = {"signal_id": "1", "motif_id": "1", "motif_instance": instance}
        if not weighted:
            return strategy.implant(sequence, signal)
        return strategy.implant(sequence, signal, sequence_position_weights={105: 0.8, 106: 0.2})

    # NOTE(review): "CC/T" with gap 2 is expected to appear as "CCAAT", so "/" apparently marks the gap - confirm
    motif_instance = MotifInstance("CC/T", 2)

    implanted = implant(motif_instance, weighted=False).get_sequence()
    self.assertTrue(implanted.find("CCAAT") > -1)
    self.assertEqual(10, len(implanted))

    implanted = implant(motif_instance, weighted=True).get_sequence()
    self.assertTrue(-1 < implanted.find("CCAAT") < 2)
    self.assertEqual(10, len(implanted))

    motif_instance = MotifInstance("CCT", 0)
    implanted = implant(motif_instance, weighted=True).get_sequence()
    self.assertTrue(-1 < implanted.find("CCT") < 2)
    self.assertEqual(10, len(implanted))

    motif_instance = MotifInstance("C/T", 0)
    implanted = implant(motif_instance, weighted=True).get_sequence()
    self.assertTrue(-1 < implanted.find("CT") < 2)
    self.assertTrue("/" not in implanted)
def create_IMGT_kmers_from_sequence(sequence: ReceptorSequence, k: int):
    """
    Create overlapping k-mers from a sequence and annotate each of them with an IMGT position.

    Every letter is paired with its position (as generated by PositionHelper.gen_imgt_positions_from_length)
    before KmerHelper.create_kmers_from_string splits the sequence.

    :param sequence: ReceptorSequence to split
    :param k: k-mer length passed to KmerHelper.create_kmers_from_string
    :return: list of (k-mer string, position) tuples
    """
    residues = sequence.get_sequence()
    positions = PositionHelper.gen_imgt_positions_from_length(len(residues))
    residues_with_positions = list(zip(residues, positions))

    raw_kmers = KmerHelper.create_kmers_from_string(residues_with_positions, k)

    def annotate(kmer):
        text = ''.join(item[0] for item in kmer)
        kmer_positions = [item[1] for item in kmer]
        lowest = min(kmer_positions)
        if int(lowest) != 112:
            return text, lowest
        # when the smallest position truncates to 112, the largest position that truncates to 112 is reported
        # NOTE(review): presumably because IMGT numbers the 112.x insertions in descending order - confirm
        return text, max(position for position in kmer_positions if int(position) == 112)

    return [annotate(kmer) for kmer in raw_kmers]
def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
    """
    Decide whether a reference sequence counts as a match for the original sequence.

    :param original_sequence: ReceptorSequence that is being matched
    :param reference_sequence: ReceptorSequence it is compared against
    :param max_distance: largest distance, as computed by edit_distance, at which the two sequences still match
    :return: truthy only if the chains are equal, self.matches_gene accepts both the v_gene pair and the
        j_gene pair, and the distance between the two sequences does not exceed max_distance
    """
    reference_metadata = reference_sequence.metadata
    original_metadata = original_sequence.metadata

    # the conditions short-circuit left to right: the distance is only computed once chain and genes agree
    return (reference_metadata.chain == original_metadata.chain
            and self.matches_gene(reference_metadata.v_gene, original_metadata.v_gene)
            and self.matches_gene(reference_metadata.j_gene, original_metadata.j_gene)
            and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance)
def test_implant_in_sequence(self):
    """Check that implanting the motif "CCC" keeps the sequence length and puts "CCC" into the sequence."""
    strategy = HealthySequenceImplanting(GappedMotifImplanting(),
                                         implanting_computation=ImplantingComputation.ROUND)
    motif = Motif("m1", GappedKmerInstantiation(), "CCC")
    signal = Signal("1", [motif], strategy)

    original = ReceptorSequence(amino_acid_sequence="ACDFQ")
    implanted = strategy.implant_in_sequence(original, signal)

    self.assertEqual(len(original.get_sequence()), len(implanted.get_sequence()))
    self.assertTrue("CCC" in implanted.get_sequence())
def construct_test_flatten_dataset(self, path):
    """
    Build a pickled SequenceDataset fixture with two sequences, labelled l1=1 and l1=2.

    :param path: directory prefix; it is concatenated directly with "sequences.pkl", so it has to
        include the trailing path separator
    :return: SequenceDataset with identifier "d1" that points at the single pickle file
    """
    sequences = [
        ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1",
                         metadata=SequenceMetadata(custom_params={"l1": 1})),
        ReceptorSequence(amino_acid_sequence="ATATAT", identifier="2",
                         metadata=SequenceMetadata(custom_params={"l1": 2})),
    ]

    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # the label values are handed to the dataset through params; the LabelConfiguration object that
    # used to be built here was never read, so it has been dropped
    return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def create_dummy_receptordataset(self, path):
    """
    Build a ReceptorDataset fixture with two TCABReceptor objects, each with an alpha and a beta chain.

    The chains of receptor "1" carry the custom param "custom1", those of receptor "2" carry "custom2".

    :param path: prefix that is concatenated with "receptors" and passed on to ReceptorDataset.build
    :return: the object returned by ReceptorDataset.build for the two receptors
    """

    def alpha_chain(residues, identifier, custom_key):
        return ReceptorSequence(amino_acid_sequence=residues, identifier=identifier,
                                metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA,
                                                          frame_type="IN",
                                                          custom_params={"d_call": "TRAD1", custom_key: "cust1"}))

    def beta_chain(residues, identifier, custom_key):
        return ReceptorSequence(amino_acid_sequence=residues, identifier=identifier,
                                metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA,
                                                          frame_type="IN",
                                                          custom_params={"d_call": "TRBD1", custom_key: "cust1"}))

    first_receptor = TCABReceptor(identifier="1",
                                  alpha=alpha_chain("AAATTT", "1a", "custom1"),
                                  beta=beta_chain("ATATAT", "1b", "custom1"))
    second_receptor = TCABReceptor(identifier="2",
                                   alpha=alpha_chain("AAAAAA", "2a", "custom2"),
                                   beta=beta_chain("AAAAAA", "2b", "custom2"))

    # NOTE(review): the positional 2 is presumably the number of receptors per file - confirm against ReceptorDataset.build
    return ReceptorDataset.build([first_receptor, second_receptor], 2, "{}receptors".format(path))
def test_create_sentences_from_repertoire(self):
    """
    Check that a repertoire of three sequences gives three sentences and that the first one
    consists of the two 3-mers "AAC" and "ACT".

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    path = EnvironmentSettings.tmp_test_path + "kmer/"
    PathBuilder.build(path)

    try:
        rep = Repertoire.build_from_sequence_objects([ReceptorSequence(amino_acid_sequence="AACT"),
                                                      ReceptorSequence(amino_acid_sequence="ACCT"),
                                                      ReceptorSequence(amino_acid_sequence="AACT")], path, {})

        sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

        self.assertEqual(3, len(sentences))
        # separate assertions show which of the three conditions failed
        self.assertEqual(2, len(sentences[0]))
        self.assertIn("AAC", sentences[0])
        self.assertIn("ACT", sentences[0])
    finally:
        shutil.rmtree(path)
def create_gapped_kmers_from_sequence(sequence: ReceptorSequence, k_left: int, max_gap: int, k_right: int = None, min_gap: int = 0):
    """
    Create gapped k-mers from the string returned by sequence.get_sequence().

    All other arguments are handed to KmerHelper.create_gapped_kmers_from_string unchanged.

    :param sequence: ReceptorSequence to split
    :param k_left: passed on as the second positional argument
    :param max_gap: passed on as the third positional argument
    :param k_right: passed on as the fourth positional argument
    :param min_gap: passed on as the fifth positional argument
    :return: whatever KmerHelper.create_gapped_kmers_from_string returns
    """
    sequence_string = sequence.get_sequence()
    return KmerHelper.create_gapped_kmers_from_string(sequence_string, k_left, max_gap, k_right, min_gap)
def test_make_subset(self):
    """
    Write 100 sequences into 10 pickle files, take a subset of 15 indices and check that the subset
    contains exactly 15 examples, all with identifiers from the requested indices.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    sequences = [ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i)) for i in range(100)]

    path = EnvironmentSettings.tmp_test_path + "element_generator_subset/"
    PathBuilder.build(path)

    try:
        filenames = ["{}batch{}.pkl".format(path, i) for i in range(10)]

        # file i holds the sequences with identifiers 10*i .. 10*i + 9
        for i, filename in enumerate(filenames):
            with open(filename, "wb") as file:
                pickle.dump(sequences[i * 10:(i + 1) * 10], file)

        d = SequenceDataset(filenames=filenames, file_size=10)

        indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

        d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

        for batch in d2.get_batch(1000):
            for sequence in batch:
                self.assertIn(int(sequence.identifier), indices)

        self.assertEqual(15, d2.get_example_count())
    finally:
        shutil.rmtree(path)
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Encode a sequence as gapped k-mer / IMGT position features.

    The gapped k-mers come from KmerHelper.create_IMGT_gapped_kmers_from_sequence: a k-mer of length
    k_left on the left side of the gap, a k-mer of length k_right on the right side, and a gap length
    between min_gap and max_gap.

    :param sequence: ReceptorSequence
    :param params: EncoderParams; from params.model the keys "k_left", "max_gap", "k_right" (falls back to
        the value of "k_left") and "min_gap" (falls back to 0) are read
    :return: list of strings, one per gapped k-mer, in which the parts of the k-mer are joined with
        Constants.FEATURE_DELIMITER; None, after a warning, if the sequence is shorter than
        k_left + k_right + max_gap
    """
    model = params.model
    k_left = model.get('k_left')
    k_right = model.get('k_right', k_left)
    max_gap = model.get('max_gap')
    min_gap = model.get('min_gap', 0)

    shortest_allowed = k_left + k_right + max_gap
    if len(sequence.get_sequence()) < shortest_allowed:
        warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
        return None

    kmers_with_positions = KmerHelper.create_IMGT_gapped_kmers_from_sequence(sequence, k_left=k_left,
                                                                             max_gap=max_gap, min_gap=min_gap,
                                                                             k_right=k_right)

    delimiter = Constants.FEATURE_DELIMITER
    return [delimiter.join(str(part) for part in kmer) for kmer in kmers_with_positions]
def test_run(self):
    """
    Run DataEncoder with a Word2VecEncoder on two single-sequence repertoires and check that a
    RepertoireDataset with two encoded examples comes back.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
    PathBuilder.build(path)

    try:
        rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                      metadata={"l1": 1, "l2": 2}, path=path)
        rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                      metadata={"l1": 0, "l2": 3}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(dataset, **{
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 6
        })

        res = DataEncoder.run(DataEncoderParams(
            dataset=dataset,
            encoder=encoder,
            encoder_params=EncoderParams(model={}, pool_size=2, label_config=lc, result_path=path,
                                         filename="dataset.csv"),
            store_encoded_data=False))

        # dedicated assertions report the offending value instead of "False is not true"
        self.assertIsInstance(res, RepertoireDataset)
        self.assertEqual(2, res.encoded_data.examples.shape[0])
    finally:
        shutil.rmtree(path)
def test_encode_sequence(self):
    """
    Check GappedKmerSequenceEncoder on "ABCDEFG": the gapped k-mers for k_left=3 / max_gap=1, the None result
    for a k_left that is too large, the feature names, and the k-mers for k_left=2 / k_right=3 with a gap of
    exactly 1.
    """

    def params_for(model):
        # a fresh EncoderParams (and LabelConfiguration) per call
        return EncoderParams(model=model, label_config=LabelConfiguration(), result_path="")

    sequence = ReceptorSequence("ABCDEFG", None, None)

    def check_with_default_right_side():
        result = GappedKmerSequenceEncoder.encode_sequence(sequence, params_for({"k_left": 3, "max_gap": 1}))
        self.assertEqual({'ABC.EFG', 'ABCDEF', 'BCDEFG'}, set(result))

        result = GappedKmerSequenceEncoder.get_feature_names(params_for({"k_left": 3, "max_gap": 1}))
        self.assertEqual({'sequence'}, set(result))

        self.assertEqual(GappedKmerSequenceEncoder.encode_sequence(sequence,
                                                                   params_for({"k_left": 10, "max_gap": 1})),
                         None)

    check_with_default_right_side()

    # the same checks once more after the amino acid sequence has been assigned explicitly
    sequence.amino_acid_sequence = "ABCDEFG"
    check_with_default_right_side()

    sequence.amino_acid_sequence = "ABCDEFG"
    result = GappedKmerSequenceEncoder.encode_sequence(
        sequence, params_for({"k_left": 2, "max_gap": 1, "min_gap": 1, "k_right": 3}))
    self.assertEqual({'AB.DEF', 'BC.EFG'}, set(result))

    result = GappedKmerSequenceEncoder.get_feature_names(
        params_for({"k_left": 2, "max_gap": 1, "min_gap": 1, "k_right": 3}))
    self.assertEqual({'sequence'}, set(result))
def test_process(self):
    """
    Filter a two-repertoire dataset with keep_chain "ALPHA" and check that only the chain "A" repertoire
    (with its metadata row) is kept, that the input dataset is left untouched, and that filtering for
    "GAMMA" raises an AssertionError.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
    PathBuilder.build(path)

    try:
        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
            path=path, metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
            path=path, metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path + "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path + "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA",
                                                           "result_path": path + "results/"})

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        with self.assertRaises(AssertionError):
            ChainRepertoireFilter.process(dataset, {"keep_chain": "GAMMA",
                                                    "result_path": path + "results/"})
    finally:
        shutil.rmtree(path)
def test_create_IMGT_kmers_from_sequence(self):
    """Check the (k-mer, IMGT position) pairs created from "CASSRYUF" with k=3."""
    kmers = KmerHelper.create_IMGT_kmers_from_sequence(ReceptorSequence("CASSRYUF"), 3)

    expected_pairs = [("CAS", 105), ("ASS", 106), ("SSR", 107), ("SRY", 108), ("RYU", 114), ("YUF", 115)]
    for expected_pair in expected_pairs:
        self.assertTrue(expected_pair in kmers)
def create_dummy_repertoire(self, path):
    """
    Build a repertoire with one beta-chain and one alpha-chain sequence and write a metadata file for it.

    The metadata file has a single row with the columns "filename", "subject_id" and "repertoire_identifier".

    :param path: directory prefix; used for the repertoire and concatenated directly with "metadata.csv"
    :return: tuple (repertoire, path of the metadata csv file)
    """
    beta_sequence = ReceptorSequence(amino_acid_sequence="AAA", nucleotide_sequence="GCTGCTGCT",
                                     identifier="receptor_1",
                                     metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA,
                                                               count=5, region_type="IMGT_CDR3", frame_type="IN",
                                                               custom_params={"d_call": "TRBD1",
                                                                              "custom_test": "cust1"}))
    alpha_sequence = ReceptorSequence(amino_acid_sequence="GGG", nucleotide_sequence="GGTGGTGGT",
                                      identifier="receptor_2",
                                      metadata=SequenceMetadata(v_gene="TRAV2", v_allele="TRAV2*01", j_gene="TRAJ2",
                                                                chain=Chain.ALPHA, count=15, frame_type=None,
                                                                region_type="IMGT_CDR3",
                                                                custom_params={"d_call": "TRAD2",
                                                                               "custom_test": "cust2"}))

    repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[beta_sequence, alpha_sequence],
                                                        path=path, metadata={"subject_id": "REP1"})

    metadata_path = path + "metadata.csv"
    metadata_table = pd.DataFrame({
        "filename": [f"{repertoire.identifier}_data.npy"],
        "subject_id": ["1"],
        "repertoire_identifier": [repertoire.identifier]
    })
    metadata_table.to_csv(metadata_path, index=False)

    return repertoire, metadata_path
def test_encode(self):
    """
    Encode two repertoires with a Word2VecEncoder (k=3, vector_size=16) and check the shape of the
    encoded examples, the "T1D" labels and the type of the encoder.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"
    PathBuilder.build(test_path)

    try:
        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={}, learn_model=True, result_path=test_path,
                                      label_config=label_configuration, filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(dataset, **{
            "k": 3,
            "model_type": "sequence",
            "vector_size": 16
        })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        # dedicated assertions report the offending value instead of "False is not true"
        self.assertIsNotNone(encoded_dataset.encoded_data)
        self.assertEqual(2, encoded_dataset.encoded_data.examples.shape[0])
        self.assertEqual(16, encoded_dataset.encoded_data.examples.shape[1])
        self.assertEqual(2, len(encoded_dataset.encoded_data.labels["T1D"]))
        self.assertEqual("T1D", encoded_dataset.encoded_data.labels["T1D"][0])
        self.assertIsInstance(encoder, W2VRepertoireEncoder)
    finally:
        shutil.rmtree(test_path)
def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict, path: str):
    """
    Creates receptor_count receptors (TCABReceptor objects) with random sequences and random labels and
    stores them in a single pickle file.

    The length of the sequence of each chain is sampled independently for each sequence from the
    corresponding chain_n_length_probabilities distribution; the letters are then drawn from the alphabet
    returned by EnvironmentSettings.get_sequence_alphabet(). Chain 1 is used as the alpha chain ("TRA")
    and chain 2 as the beta chain ("TRB") of the receptor.

    Labels are assigned randomly as well: for every label, each receptor gets exactly one class drawn
    from the distribution given for that label, independently of the other labels. Negative classes
    therefore have to be listed in the specification too. In addition, every receptor gets the metadata
    entry "subject" with the value "subj_<index + 1>".

    An example of input parameters is given below:

        receptor_count: 100 # generate 100 receptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for chain 1 will have length 14
            15: 0.2 # 20% of all generated sequences for chain 1 will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for chain 2 will have length 14
            15: 0.2 # 20% of all generated sequences for chain 2 will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label, assigned independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0

    :param receptor_count: number of receptors to generate
    :param chain_1_length_probabilities: dict mapping sequence length to probability for the alpha chain
    :param chain_2_length_probabilities: dict mapping sequence length to probability for the beta chain
    :param labels: dict mapping label name to a dict that maps class to probability
    :param path: directory in which "batch01.pickle" is written
    :return: ReceptorDataset that points at the pickle file and lists the classes of every label in its params
    """
    RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                     chain_2_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    def random_chain(length_probabilities, chain, cell_id):
        # the length is drawn first, then that many letters are drawn from the alphabet
        length = random.choices(list(length_probabilities.keys()), length_probabilities.values())[0]
        residues = "".join(random.choices(alphabet, k=length))
        return ReceptorSequence(residues,
                                metadata=SequenceMetadata(count=1,
                                                          v_subgroup=chain+"V1",
                                                          v_gene=chain+"V1-1",
                                                          v_allele=chain+"V1-1*01",
                                                          j_subgroup=chain + "J1",
                                                          j_gene=chain + "J1-1",
                                                          j_allele=chain + "J1-1*01",
                                                          chain=chain,
                                                          cell_id=cell_id))

    receptors = []
    for i in range(receptor_count):
        # alpha, beta and labels are drawn in this order for every receptor
        alpha = random_chain(chain_1_length_probabilities, "TRA", i)
        beta = random_chain(chain_2_length_probabilities, "TRB", i)
        receptor_metadata = {label: random.choices(list(label_dict.keys()), label_dict.values(), k=1)[0]
                             for label, label_dict in labels.items()}
        # set last, so it overrides a label that happens to be called "subject"
        receptor_metadata["subject"] = f"subj_{i + 1}"
        receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=receptor_metadata))

    directory = path if path[-1] == '/' else path + '/'
    filename = f"{directory}batch01.pickle"

    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    label_classes = {label: list(label_dict.keys()) for label, label_dict in labels.items()}
    return ReceptorDataset(params=label_classes, filenames=[filename], file_size=receptor_count)
def test_run(self):
    """
    Run SignalImplanter on ten identical repertoires with two implantings and check the result.

    Checked are: the dataset still has ten examples, every repertoire has a metadata entry for both
    signals, four repertoires are marked with signal s2 and two with signal s1, and the data filename
    of every repertoire is listed in the "filename" metadata column.

    The temporary directory is removed in a finally clause, so a failing assertion no longer
    leaves test files behind.
    """
    path = EnvironmentSettings.root_path + "test/tmp/signalImplanter/"
    os.makedirs(path, exist_ok=True)

    try:
        sequences = [ReceptorSequence("ACDEFG", identifier=identifier) for identifier in ("1", "2", "3", "4")]

        repertoires = [Repertoire.build_from_sequence_objects(sequence_objects=sequences, path=path, metadata={})
                       for _ in range(10)]
        dataset = RepertoireDataset(repertoires=repertoires)

        m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
        m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")

        s1 = Signal(identifier="s1", motifs=[m1],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(),
                                                                  implanting_computation=ImplantingComputation.ROUND))
        s2 = Signal(identifier="s2", motifs=[m1, m2],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(),
                                                                  implanting_computation=ImplantingComputation.ROUND))

        simulation = Simulation([Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                                            signals=[s1, s2], name="i1"),
                                 Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                                            signals=[s2], name="i2")])

        input_params = SimulationState(dataset=dataset, result_path=path, simulation=simulation, signals=[s1, s2])

        new_dataset = SignalImplanter.run(input_params)

        # query the repertoires once and reuse them for all checks below
        implanted_repertoires = list(new_dataset.get_data(batch_size=10))
        s1_key = f"signal_{s1.id}"
        s2_key = f"signal_{s2.id}"

        self.assertEqual(10, len(new_dataset.get_example_ids()))

        # presence of the keys is asserted before they are indexed, so a missing key shows up as a
        # test failure rather than as a KeyError
        self.assertTrue(all(s1_key in rep.metadata for rep in implanted_repertoires))
        self.assertTrue(all(s2_key in rep.metadata for rep in implanted_repertoires))

        reps_with_s2 = sum(rep.metadata[s2_key] is True for rep in implanted_repertoires)
        reps_with_s1 = sum(rep.metadata[s1_key] is True for rep in implanted_repertoires)

        # NOTE(review): 0.2 * 10 = 2 repertoires per implanting; s2 is part of both implantings, s1 only of
        # the first - this presumes the two implantings use disjoint repertoires
        self.assertEqual(4, reps_with_s2)
        self.assertEqual(2, reps_with_s1)

        metadata_filenames = new_dataset.get_metadata(["filename"])["filename"]
        self.assertTrue(all(repertoire.data_filename in metadata_filenames
                            for repertoire in new_dataset.repertoires))
    finally:
        shutil.rmtree(path)
def _construct_test_repertoiredataset(self, path, positional):
    """
    Build a RepertoireDataset with two repertoires together with a matching LabelConfiguration.

    :param path: path handed to Repertoire.build_from_sequence_objects
    :param positional: if truthy, the repertoires hold long homopolymer sequences ("A"s and "T"s); otherwise
        they hold short sequences of 3 and 4 letters
    :return: tuple (dataset, label configuration with the labels "l1" and "l2")
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        first_sequences = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                           ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
        second_sequences = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
    else:
        first_sequences = [ReceptorSequence("AAAA", identifier="1"),
                           ReceptorSequence("ATA", identifier="2"),
                           ReceptorSequence("ATA", identifier='3')]
        second_sequences = [ReceptorSequence("ATA", identifier="1"),
                            ReceptorSequence("TAA", identifier="2")]

    for seq in first_sequences:
        receptors1.append(seq)
    for seq in second_sequences:
        receptors2.append(seq)

    rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"},
                                                  path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"},
                                                  path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    return RepertoireDataset(repertoires=[rep1, rep2]), lc
def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
    """
    Collect the reference sequences that match the given sequence.

    :param sequence: ReceptorSequence to look up
    :param reference_sequences: list of sequences that are tested with self.matches_sequence
    :param max_distance: passed to self.matches_sequence unchanged
    :return: dict with the keys "matching_sequences" (sequence strings of all matching references, in
        input order), "sequence", "v_gene", "j_gene" and "chain" (the latter four describe the given sequence)
    """
    matching_sequences = []
    for reference in reference_sequences:
        if self.matches_sequence(sequence, reference, max_distance):
            matching_sequences.append(reference.get_sequence())

    summary = {"matching_sequences": matching_sequences}
    summary["sequence"] = sequence.get_sequence()
    summary["v_gene"] = sequence.metadata.v_gene
    summary["j_gene"] = sequence.metadata.j_gene
    summary["chain"] = sequence.metadata.chain
    return summary
def get_formatted_node_metadata(self, seq: ReceptorSequence):
    """
    Collect the node metadata of a receptor chain.

    Gene names are prefixed with the value of the chain; the subgroup of a gene is the part of its name
    before the first "-". An entry of self.additional_node_attributes that raises a KeyError when it is
    looked up is reported with a warning and represented by None.

    :param seq: ReceptorSequence to describe
    :return: list [sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene] followed by one value per
        entry of self.additional_node_attributes
    """
    chain = seq.get_attribute('chain').value
    v_gene = seq.get_attribute('v_gene')
    j_gene = seq.get_attribute('j_gene')

    additional_info = []
    for attr in self.additional_node_attributes:
        try:
            value = seq.get_attribute(attr)
        except KeyError:
            value = None
            warnings.warn(
                f"CytoscapeNetworkExporter: additional metadata attribute {attr} was not found for some receptor chain(s), "
                f"value None was used instead.")
        additional_info.append(value)

    # sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene
    core_info = [
        seq.get_sequence(),
        f"{chain}{v_gene.split('-')[0]}",
        f"{chain}{v_gene}",
        f"{chain}{j_gene.split('-')[0]}",
        f"{chain}{j_gene}",
    ]
    return core_info + additional_info