def _create_new_sequences(self, sequences, new_sequence_count, signal) -> List[ReceptorSequence]: new_sequences = sequences[:-new_sequence_count] for _ in range(new_sequence_count): motif = random.choice(signal.motifs) motif_instance = motif.instantiate_motif() annotation = SequenceAnnotation([ ImplantAnnotation(signal_id=signal.id, motif_id=motif.identifier, motif_instance=motif_instance.instance, position=0) ]) metadata = SequenceMetadata(v_gene="TRBV6-1", j_gene="TRBJ2-7", count=1, chain="B") new_sequences.append( ReceptorSequence(amino_acid_sequence=motif_instance.instance, annotation=annotation, metadata=metadata)) return new_sequences
def create_gapped_kmers_from_sequence(sequence: ReceptorSequence, k_left: int, max_gap: int, k_right: int = None, min_gap: int = 0):
    """Create gapped k-mers for a receptor sequence.

    The string returned by ``sequence.get_sequence()`` is handed to
    ``KmerHelper.create_gapped_kmers_from_string`` together with the k-mer and
    gap settings, which are forwarded unchanged and in the same order.

    :param sequence: receptor sequence whose string form is split into k-mers
    :param k_left: forwarded as the second positional argument
    :param max_gap: forwarded as the third positional argument
    :param k_right: forwarded as the fourth positional argument (None by default)
    :param min_gap: forwarded as the fifth positional argument (0 by default)
    :return: whatever ``KmerHelper.create_gapped_kmers_from_string`` returns
    """
    sequence_string = sequence.get_sequence()
    return KmerHelper.create_gapped_kmers_from_string(sequence_string, k_left, max_gap, k_right, min_gap)
def test_process(self):
    """Check that ChainRepertoireFilter keeps only repertoires with the requested chain.

    Two single-sequence repertoires are built (chain "A" and chain "B").
    Filtering with keep_chain "ALPHA" must return one repertoire (the "AAA"
    one) with matching metadata while leaving the input dataset untouched, and
    keep_chain "GAMMA" must raise an AssertionError.
    """
    path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
        path=path, metadata={})
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
        path=path, metadata={})

    metadata = pd.DataFrame({"CD": [1, 0]})
    metadata.to_csv(path / "metadata.csv")

    dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path / "metadata.csv")

    dataset2 = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA", "result_path": path / "results"})

    self.assertEqual(1, len(dataset2.get_data()))
    # the original dataset must not be modified by the filter
    self.assertEqual(2, len(dataset.get_data()))

    metadata_dict = dataset2.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for rep in dataset2.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset,
                      {"keep_chain": "GAMMA", "result_path": path / "results"})
def test_run(self):
    """Check that DataEncoder.run returns an encoded RepertoireDataset.

    Two single-sequence repertoires with labels "l1" and "l2" are encoded with
    a Word2VecEncoder (k=3, vector_size=6); the result must be a
    RepertoireDataset whose encoded examples have two rows.
    """
    path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                  metadata={"l1": 1, "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                  metadata={"l1": 0, "l2": 3}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    encoder = Word2VecEncoder.build_object(dataset, **{
        "k": 3,
        "model_type": ModelType.SEQUENCE.name,
        "vector_size": 6
    })

    res = DataEncoder.run(DataEncoderParams(
        dataset=dataset,
        encoder=encoder,
        encoder_params=EncoderParams(model={},
                                     pool_size=2,
                                     label_config=lc,
                                     result_path=path,
                                     filename="dataset.csv"),
        store_encoded_data=False))

    # dedicated assert methods report the offending value on failure
    self.assertIsInstance(res, RepertoireDataset)
    self.assertEqual(2, res.encoded_data.examples.shape[0])
def construct_test_flatten_dataset(self, path):
    """Build a SequenceDataset of two sequences, each with an "l1" custom parameter.

    :param path: directory that is created via PathBuilder and passed to SequenceDataset.build
    :return: the result of SequenceDataset.build with file_size=10
    """
    first_sequence = ReceptorSequence(amino_acid_sequence="AAATTT",
                                      identifier="1",
                                      metadata=SequenceMetadata(custom_params={"l1": 1}))
    second_sequence = ReceptorSequence(amino_acid_sequence="ATATAT",
                                       identifier="2",
                                       metadata=SequenceMetadata(custom_params={"l1": 2}))

    PathBuilder.build(path)

    return SequenceDataset.build(sequences=[first_sequence, second_sequence], file_size=10, path=path)
def test_create_IMGT_kmers_from_sequence(self):
    """Check the (k-mer, IMGT position) pairs produced for "CASSRYUF" with k=3."""
    kmers = KmerHelper.create_IMGT_kmers_from_sequence(ReceptorSequence("CASSRYUF"), 3,
                                                       sequence_type=SequenceType.AMINO_ACID)

    expected_pairs = [
        ("CAS", 105),
        ("ASS", 106),
        ("SSR", 107),
        ("SRY", 108),
        ("RYU", 114),
        ("YUF", 115),
    ]

    for pair in expected_pairs:
        self.assertIn(pair, kmers)
def create_dummy_repertoire(self, path):
    """Build a two-sequence repertoire (one beta, one alpha chain) and its metadata file.

    :param path: directory in which the repertoire and "metadata.csv" are written
    :return: tuple of (repertoire, path to "metadata.csv")
    """
    beta_sequence = ReceptorSequence(
        amino_acid_sequence="AAA",
        nucleotide_sequence="GCTGCTGCT",
        identifier="receptor_1",
        metadata=SequenceMetadata(v_gene="TRBV1",
                                  j_gene="TRBJ1",
                                  chain=Chain.BETA,
                                  count=5,
                                  region_type="IMGT_CDR3",
                                  frame_type="IN",
                                  custom_params={"d_call": "TRBD1", "custom_test": "cust1"}))

    alpha_sequence = ReceptorSequence(
        amino_acid_sequence="GGG",
        nucleotide_sequence="GGTGGTGGT",
        identifier="receptor_2",
        metadata=SequenceMetadata(v_gene="TRAV2",
                                  v_allele="TRAV2*01",
                                  j_gene="TRAJ2",
                                  chain=Chain.ALPHA,
                                  count=15,
                                  frame_type=None,
                                  region_type="IMGT_CDR3",
                                  custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))

    repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[beta_sequence, alpha_sequence],
                                                        path=path,
                                                        metadata={"subject_id": "REP1"})

    # one-row metadata table pointing at the repertoire that was just written
    metadata_path = path / "metadata.csv"
    metadata_table = pd.DataFrame({
        "filename": [f"{repertoire.identifier}_data.npy"],
        "subject_id": ["1"],
        "repertoire_identifier": [repertoire.identifier]
    })
    metadata_table.to_csv(metadata_path, index=False)

    return repertoire, metadata_path
def test_encode(self):
    """Check Word2Vec encoding of a two-repertoire dataset labelled with "T1D".

    The encoded data must have one row per repertoire, 16 columns (the
    requested vector_size), two "T1D" labels with "T1D" first, and the built
    encoder must be a W2VRepertoireEncoder.
    """
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"
    PathBuilder.build(test_path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, test_path)

    sequence1 = ReceptorSequence("CASSVFA", identifier="1")
    sequence2 = ReceptorSequence("CASSCCC", identifier="2")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={},
                                  learn_model=True,
                                  result_path=test_path,
                                  label_config=label_configuration,
                                  filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(dataset, **{
        "k": 3,
        "model_type": "sequence",
        "vector_size": 16
    })

    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    # dedicated assert methods report the offending value on failure
    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertEqual(2, encoded_dataset.encoded_data.examples.shape[0])
    self.assertEqual(16, encoded_dataset.encoded_data.examples.shape[1])
    self.assertEqual(2, len(encoded_dataset.encoded_data.labels["T1D"]))
    self.assertEqual("T1D", encoded_dataset.encoded_data.labels["T1D"][0])
    self.assertIsInstance(encoder, W2VRepertoireEncoder)
def test_create_sentences_from_repertoire(self):
    """Check that each sequence of a repertoire becomes one sentence of 3-mers.

    A repertoire of three 4-letter sequences must give three sentences, and
    the first one ("AACT") must consist of exactly "AAC" and "ACT".
    """
    path = EnvironmentSettings.tmp_test_path / "kmer/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    rep = Repertoire.build_from_sequence_objects([
        ReceptorSequence(amino_acid_sequence="AACT"),
        ReceptorSequence(amino_acid_sequence="ACCT"),
        ReceptorSequence(amino_acid_sequence="AACT")
    ], path, {})

    sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

    self.assertEqual(3, len(sentences))
    # split into separate assertions so a failure names the violated condition
    self.assertEqual(2, len(sentences[0]))
    self.assertIn("AAC", sentences[0])
    self.assertIn("ACT", sentences[0])
def test(self):
    """Check k-mer frequency encoding of a pickled ReceptorDataset.

    Four TCAB receptors are stored (1 and 3 identical, 2 and 4 identical) and
    encoded with continuous 3-mers. The encoding must contain all four
    examples, give identical rows for receptors 1 and 3, and expose
    chain-prefixed feature names.
    """
    receptors = [
        TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                     beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="1"),
        TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                     beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="2"),
        TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                     beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="3"),
        TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                     beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="4")
    ]

    path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    filename = path / "receptors.pkl"
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    dataset = ReceptorDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")

    encoder = KmerFreqReceptorEncoder.build_object(dataset, **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3
    })

    encoded_dataset = encoder.encode(dataset, EncoderParams(
        result_path=path / "2/",
        label_config=lc,
        pool_size=2,
        learn_model=True,
        model={},
        filename="dataset.csv",
        encode_labels=False
    ))

    encoded_data = encoded_dataset.encoded_data

    self.assertEqual(4, encoded_data.examples.shape[0])
    for identifier in ['1', '2', '3', '4']:
        self.assertIn(identifier, encoded_data.example_ids)
    # receptors "1" and "3" hold the same chains, so their rows must match
    self.assertTrue(numpy.array_equal(encoded_data.examples[0].A, encoded_data.examples[2].A))
    for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]:
        self.assertIn(feature_name, encoded_data.feature_names)
def _construct_test_dataset(self, path, dataset_size: int = 50):
    """Build a two-receptor ReceptorDataset and a label configuration for label "l1".

    :param path: directory that is created via PathBuilder and passed to ReceptorDataset.build
    :param dataset_size: not used by this helper
    :return: tuple of (dataset, label configuration)
    """
    first_receptor = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                                  beta=ReceptorSequence(amino_acid_sequence="ATA"),
                                  metadata={"l1": 1},
                                  identifier="1")
    second_receptor = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                                   beta=ReceptorSequence(amino_acid_sequence="ATT"),
                                   metadata={"l1": 2},
                                   identifier="2")

    PathBuilder.build(path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    return ReceptorDataset.build([first_receptor, second_receptor], 2, path), label_config
def get_formatted_node_metadata(self, seq: ReceptorSequence):
    """Format the node attributes of one receptor chain.

    :param seq: receptor sequence providing the 'chain', 'v_gene' and 'j_gene' attributes
    :return: list of [sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene]
             (each gene entry prefixed with the chain value) followed by one
             value per entry of ``self.additional_node_attributes``; attributes
             that raise KeyError are reported as None with a warning
    """
    chain = seq.get_attribute('chain').value
    v_gene = seq.get_attribute('v_gene')
    j_gene = seq.get_attribute('j_gene')

    extra_values = []
    for attribute_name in self.additional_node_attributes:
        try:
            value = seq.get_attribute(attribute_name)
        except KeyError:
            value = None
            warnings.warn(
                f"CytoscapeNetworkExporter: additional metadata attribute {attribute_name} was not found for some receptor chain(s), "
                f"value None was used instead.")
        extra_values.append(value)

    # sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene
    node_metadata = [seq.get_sequence()]
    for gene in (v_gene, j_gene):
        # the subgroup is the part of the gene name before the first '-'
        node_metadata.append(f"{chain}{gene.split('-')[0]}")
        node_metadata.append(f"{chain}{gene}")

    return node_metadata + extra_values
def test_create_IMGT_gapped_kmers_from_sequence(self):
    """Check the gapped (k-mer, IMGT position) pairs produced for "CASSRYUF"."""
    kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(ReceptorSequence("CASSRYUF"), 2, 1, 1, 1)

    expected_pairs = [
        ('CA.S', 105),
        ('AS.R', 106),
        ('SS.Y', 107),
        ('SR.U', 108),
        ('RY.F', 114),
    ]

    for pair in expected_pairs:
        self.assertIn(pair, kmers)
def test_encode_sequence(self):
    """Check that IdentitySequenceEncoder returns ["AAA"] for every tested frame type."""
    for frame_type in ("OUT", "STOP", "IN"):
        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type=frame_type))
        encoder = IdentitySequenceEncoder()
        params = EncoderParams(model={}, label_config=LabelConfiguration(), result_path="")

        self.assertEqual(["AAA"], encoder.encode_sequence(sequence, params))
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire dataset and a label configuration for labels "l1" and "l2".

    :param path: directory in which the repertoires are written
    :param positional: if truthy, long homopolymer sequences are used; otherwise short 3-4 letter ones
    :return: tuple of (dataset, label configuration)
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        first_batch = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                       ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
        for seq in first_batch:
            receptors1.append(seq)

        second_batch = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
        for seq in second_batch:
            receptors2.append(seq)
    else:
        first_batch = [ReceptorSequence("AAAA", identifier="1"),
                       ReceptorSequence("ATA", identifier="2"),
                       ReceptorSequence("ATA", identifier='3')]
        for seq in first_batch:
            receptors1.append(seq)

        second_batch = [ReceptorSequence("ATA", identifier="1"),
                        ReceptorSequence("TAA", identifier="2")]
        for seq in second_batch:
            receptors2.append(seq)

    rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"},
                                                  path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"},
                                                  path=path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    return RepertoireDataset(repertoires=[rep1, rep2]), label_config
def create_from_record(cls, record: np.void):
    """Rebuild a TCGDReceptor from a structured record.

    :param record: record whose fields include 'version', 'identifier', 'metadata'
                   (a JSON string) and the chain fields prefixed with 'gamma_' / 'delta_'
    :return: TCGDReceptor built from the two chain sub-records
    :raises NotImplementedError: if the record has no 'version' field or its
            version differs from TCGDReceptor.version
    """
    if 'version' not in record.dtype.names or record['version'] != TCGDReceptor.version:
        raise NotImplementedError(
            f"Supported ({TCGDReceptor.version}) and available version differ, but there is no converter available."
        )

    def extract_chain_record(prefix):
        # select the prefixed fields of one chain and rename them to the
        # un-prefixed names used by ReceptorSequence
        chain_record = record[[prefix + name for name in ReceptorSequence.get_record_names()]]
        chain_record.dtype.names = ReceptorSequence.get_record_names()
        return chain_record

    gamma_record = extract_chain_record('gamma_')
    delta_record = extract_chain_record('delta_')

    return TCGDReceptor(gamma=ReceptorSequence.create_from_record(gamma_record),
                        delta=ReceptorSequence.create_from_record(delta_record),
                        identifier=record['identifier'],
                        metadata=json.loads(record['metadata']))
def create_from_record(cls, record):
    """Rebuild a BCKReceptor from a structured record.

    :param record: record whose fields include 'version', 'identifier', 'metadata'
                   (a JSON string) and the chain fields prefixed with 'heavy_' / 'kappa_'
    :return: BCKReceptor built from the two chain sub-records
    :raises NotImplementedError: if the record has no 'version' field or its
            version differs from BCKReceptor.version
    """
    if 'version' not in record.dtype.names or record['version'] != BCKReceptor.version:
        raise NotImplementedError(
            f"Supported ({BCKReceptor.version}) and available version differ, but there is no converter available."
        )

    def extract_chain_record(prefix):
        # select the prefixed fields of one chain and rename them to the
        # un-prefixed names used by ReceptorSequence
        chain_record = record[[prefix + name for name in ReceptorSequence.get_record_names()]]
        chain_record.dtype.names = ReceptorSequence.get_record_names()
        return chain_record

    heavy_record = extract_chain_record('heavy_')
    kappa_record = extract_chain_record('kappa_')

    return BCKReceptor(heavy=ReceptorSequence.create_from_record(heavy_record),
                       kappa=ReceptorSequence.create_from_record(kappa_record),
                       identifier=record['identifier'],
                       metadata=json.loads(record['metadata']))
def test_split_dataset(self):
    """Check leave-one-out splitting of a ReceptorDataset by the "subject" metadata field.

    Ten receptors are assigned to subjects 0-2. Splitting must give three
    train/test pairs where test set i holds only subject i and train set i
    holds none of subject i.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "leave_one_out_splitter/")
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    receptors = [TCABReceptor(ReceptorSequence(), ReceptorSequence(), {"subject": i % 3})
                 for i in range(10)]

    filename = path / "batch1.pickle"
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    dataset = ReceptorDataset(filenames=[filename])

    params = DataSplitterParams(
        dataset,
        SplitType.LEAVE_ONE_OUT_STRATIFICATION,
        3,
        paths=[path / f"result_{i}/" for i in range(1, 4)],
        split_config=SplitConfig(SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                 split_count=3,
                                 leave_one_out_config=LeaveOneOutConfig("subject", 1)))

    train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(params)

    self.assertEqual(3, len(train_datasets))
    self.assertEqual(3, len(test_datasets))

    for i in range(3):
        self.assertTrue(all(receptor.metadata["subject"] == i for receptor in test_datasets[i].get_data()))
        self.assertTrue(all(receptor.metadata["subject"] != i for receptor in train_datasets[i].get_data()))
def test_encode_sequence(self):
    """Check KmerSequenceEncoder output for k=3.

    "CASSVFRTY" must give exactly its seven overlapping 3-mers, and a sequence
    shorter than k ("AC") must give None.
    """
    def make_params():
        # a fresh parameter object for every encode call
        return EncoderParams(model={"k": 3},
                             label_config=LabelConfiguration(),
                             result_path="",
                             pool_size=4)

    seq = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
    result = KmerSequenceEncoder.encode_sequence(seq, make_params())

    for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
        self.assertIn(kmer, result)
    self.assertEqual(7, len(result))

    too_short = ReceptorSequence(amino_acid_sequence="AC")
    self.assertEqual(KmerSequenceEncoder.encode_sequence(too_short, make_params()), None)
def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
    """Collect the reference sequences that match the given sequence.

    :param sequence: sequence to compare against the references
    :param reference_sequences: candidates tested with ``self.matches_sequence``
    :param max_distance: passed through to ``self.matches_sequence``
    :return: dict with the matching reference sequence strings plus the
             sequence string, v_gene, j_gene and chain of ``sequence``
    """
    matching_sequences = []
    for reference in reference_sequences:
        if self.matches_sequence(sequence, reference, max_distance):
            matching_sequences.append(reference.get_sequence())

    return {
        "matching_sequences": matching_sequences,
        "sequence": sequence.get_sequence(),
        "v_gene": sequence.metadata.v_gene,
        "j_gene": sequence.metadata.j_gene,
        "chain": sequence.metadata.chain
    }
def test_process(self):
    """Check that SubjectRepertoireCollector merges repertoires sharing a subject_id.

    Three single-sequence repertoires (two for "patient1", one for
    "patient3") must be collected into two repertoires with 2 and 1
    sequences, while the input dataset keeps its three repertoires.
    """
    path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    reps = [
        Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")], path=path,
                                               metadata={"subject_id": "patient1"}),
        Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="2")], path=path,
                                               metadata={"subject_id": "patient1"}),
        Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="3")], path=path,
                                               metadata={"subject_id": "patient3"})
    ]

    dataset = RepertoireDataset(repertoires=reps)

    dataset2 = SubjectRepertoireCollector().process_dataset(dataset, path / "result")

    self.assertEqual(2, len(dataset2.get_data()))
    # the original dataset must not be modified
    self.assertEqual(3, len(dataset.get_data()))

    values = [2, 1]
    for index, rep in enumerate(dataset2.get_data()):
        self.assertEqual(values[index], len(rep.sequences))
def test_match_repertoire(self):
    """Check SequenceMatcher.match_repertoire with COUNT and CLONAL_PERCENTAGE summaries.

    A four-sequence repertoire (counts 3, 2, 1, 4) is matched against two
    reference sequences with a maximum distance of 2. Three repertoire
    sequences must have exactly one match, "CCCCCC" none, and the clonal
    percentage must be 0.8.
    """
    path = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    repertoire = Repertoire.build_from_sequence_objects(sequence_objects=[
        ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="1",
                         metadata=SequenceMetadata(chain="A", count=3)),
        ReceptorSequence(amino_acid_sequence="CCCCCC", identifier="2",
                         metadata=SequenceMetadata(chain="A", count=2)),
        ReceptorSequence(amino_acid_sequence="AAAACC", identifier="3",
                         metadata=SequenceMetadata(chain="A", count=1)),
        ReceptorSequence(amino_acid_sequence="TADQVF", identifier="4",
                         metadata=SequenceMetadata(chain="A", count=4))
    ], metadata={"CD": True}, path=path)

    sequences = [
        ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A")),
        ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A"))
    ]

    matcher = SequenceMatcher()
    result = matcher.match_repertoire(repertoire, 0, sequences, 2, SequenceMatchingSummaryType.COUNT)

    self.assertIn("sequences", result)
    self.assertIn("repertoire", result)
    self.assertIn("repertoire_index", result)

    self.assertEqual(4, len(result["sequences"]))
    self.assertEqual(1, len(result["sequences"][0]["matching_sequences"]))
    self.assertEqual(0, len(result["sequences"][1]["matching_sequences"]))
    self.assertEqual(1, len(result["sequences"][2]["matching_sequences"]))
    self.assertEqual(1, len(result["sequences"][3]["matching_sequences"]))

    self.assertEqual(3, len([r for r in result["sequences"] if len(r["matching_sequences"]) > 0]))
    self.assertTrue(result["metadata"]["CD"])

    result = matcher.match_repertoire(repertoire, 0, sequences, 2, SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
    # the value is a computed float, so compare with a tolerance instead of exact equality
    self.assertAlmostEqual(0.8, result["clonal_percentage"])
def test_match(self):
    """Check SequenceMatcher.match on a dataset holding a single repertoire.

    The result must contain one entry under "repertoires" that carries the
    repertoire metadata, and the fourth repertoire sequence ("TADQVF") must
    have exactly one matching reference sequence.
    """
    path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
    PathBuilder.build(path)
    # Registered as a cleanup so the temporary directory is removed even when
    # one of the assertions below fails.
    self.addCleanup(shutil.rmtree, path)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(amino_acid_sequence="AAAAAA",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="3"),
            ReceptorSequence(amino_acid_sequence="CCCCCC",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="4"),
            ReceptorSequence(amino_acid_sequence="AAAACC",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="5"),
            ReceptorSequence(amino_acid_sequence="TADQVF",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"), identifier="6")
        ],
        metadata={"CD": True}, path=path)

    dataset = RepertoireDataset(repertoires=[repertoire])

    sequences = [
        ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="1"),
        ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"), identifier="2")
    ]

    matcher = SequenceMatcher()
    result = matcher.match(dataset, sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertIn("repertoires", result)
    self.assertEqual(1, len(result["repertoires"][0]["sequences"][3]["matching_sequences"]))
    self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))
def test_encode_sequence(self):
    """Check IMGT gapped k-mer encoding (k_left=1, max_gap=1) with '///' position separators."""
    def encode(amino_acids):
        # encode one sequence with a fresh parameter object and return the distinct k-mers
        sequence = ReceptorSequence(amino_acids, None, None)
        params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                               label_config=LabelConfiguration(),
                               result_path="")
        return set(IMGTGappedKmerEncoder.encode_sequence(sequence, params))

    expected_short = {
        'AH///105', 'HC///106', 'CD///107', 'DE///116',
        'A.C///105', 'H.D///106', 'C.E///107'
    }
    self.assertEqual(expected_short, encode("AHCDE"))

    expected_long = {
        'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
        'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
        'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
        'AY///116',
        'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108', 'P.E///109',
        'R.R///110', 'E.A///111', 'R.T///111.001', 'A.Y///112.002',
        'T.E///112.001', 'Y.Q///112', 'E.C///113', 'Q.A///114', 'C.Y///115'
    }
    self.assertEqual(expected_long, encode("CASSPRERATYEQCAY"))
def _make_sequence_object(self, row, load_implants: bool = False):
    """Build a ReceptorSequence from one structured row.

    Fields missing from the row are set to None, except ``frame_type`` which
    falls back to "IN". Every field of ``self.fields`` that is not in
    ``Repertoire.FIELDS`` is copied into the metadata custom parameters.

    :param row: structured record; its field names are read from ``row.dtype.names``
    :param load_implants: if True, each non-empty value of a field outside
           ``Repertoire.FIELDS`` is parsed with ``ast.literal_eval`` and, when
           it can be unpacked into an ImplantAnnotation, added to the annotation
    :return: the assembled ReceptorSequence
    """
    fields = row.dtype.names

    def field_value(name, default=None):
        # value of the field if the row has it, otherwise the default
        return row[name] if name in fields else default

    implants = []
    if load_implants:
        for key in fields:
            if key in Repertoire.FIELDS:
                continue
            raw_value = row[key]
            if not raw_value:
                continue
            try:
                implants.append(ImplantAnnotation(**ast.literal_eval(raw_value)))
            except (SyntaxError, ValueError, TypeError):
                # values that do not describe an implant are skipped
                continue

    if "counts" in fields and not NumpyHelper.is_nan_or_empty(row['counts']):
        count = row["counts"]
    else:
        count = None

    custom_params = {key: row[key] if key in fields else None
                     for key in set(self.fields) - set(Repertoire.FIELDS)}

    metadata = SequenceMetadata(v_gene=field_value("v_genes"),
                                j_gene=field_value("j_genes"),
                                v_subgroup=field_value("v_subgroups"),
                                j_subgroup=field_value("j_subgroups"),
                                v_allele=field_value("v_alleles"),
                                j_allele=field_value("j_alleles"),
                                chain=field_value("chains"),
                                count=count,
                                region_type=field_value("region_types"),
                                frame_type=field_value("frame_types", "IN"),
                                cell_id=field_value("cell_ids"),
                                custom_params=custom_params)

    return ReceptorSequence(amino_acid_sequence=field_value("sequence_aas"),
                            nucleotide_sequence=field_value("sequences"),
                            identifier=field_value("sequence_identifiers"),
                            metadata=metadata,
                            annotation=SequenceAnnotation(implants=implants))
def process_iris_chain(row, chain, dual_chain_id, all_genes):
    """Create receptor sequences for one chain of an IRIS row.

    V and J alleles are read from the "TR<chain> - V gene (1)" and
    "TR<chain> - J gene (1)" columns (" | "-separated), with the "TR<chain>"
    prefix and any remaining occurrence of the chain letter removed.

    :param row: mapping from IRIS column names to string values
    :param chain: chain letter used in the column names and in the metadata
    :param dual_chain_id: id used in the sequence column name and stored as a custom parameter
    :param all_genes: if truthy, one sequence is created per V/J allele combination;
           otherwise a single sequence is created from one allele popped from each set
    :return: ReceptorSequenceList with the created sequences
    """
    def read_alleles(column_name):
        # distinct alleles of one gene column, stripped of the chain prefix
        return {gene.replace(f"TR{chain}", "").replace(chain, "")
                for gene in row[column_name].split(" | ")}

    def make_sequence(v_allele, j_allele):
        metadata = SequenceMetadata(v_gene=v_allele.split(Constants.ALLELE_DELIMITER)[0],
                                    v_allele=v_allele,
                                    v_subgroup=v_allele.split("-")[0],
                                    j_gene=j_allele.split(Constants.ALLELE_DELIMITER)[0],
                                    j_allele=j_allele,
                                    j_subgroup=j_allele.split("-")[0],
                                    chain=chain,
                                    custom_params={"dual_chain_id": dual_chain_id})
        return ReceptorSequence(amino_acid_sequence=row[f"Chain: TR{chain} ({dual_chain_id})"],
                                metadata=metadata)

    sequences = ReceptorSequenceList()

    v_alleles = read_alleles(f"TR{chain} - V gene (1)")
    j_alleles = read_alleles(f"TR{chain} - J gene (1)")

    if all_genes:
        allele_pairs = [(v_allele, j_allele) for v_allele in v_alleles for j_allele in j_alleles]
    else:
        # take one v and one j allele; set.pop() returns an arbitrary element
        allele_pairs = [(v_alleles.pop(), j_alleles.pop())]

    for v_allele, j_allele in allele_pairs:
        sequences.append(make_sequence(v_allele, j_allele))

    return sequences
def test_encode_sequence(self):
    """Check IMGT gapped k-mer encoding (k_left=1, max_gap=1) with '-' position separators."""
    def encode(amino_acids):
        # encode one sequence with a fresh parameter object and return the distinct k-mers
        sequence = ReceptorSequence(amino_acids, None, None)
        params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                               label_config=LabelConfiguration(),
                               result_path="")
        return set(IMGTGappedKmerEncoder.encode_sequence(sequence, params))

    expected_short = {
        'AH-105', 'HC-106', 'CD-107', 'DE-116',
        'A.C-105', 'H.D-106', 'C.E-107'
    }
    self.assertEqual(expected_short, encode("AHCDE"))

    expected_long = {
        'CA-105', 'AS-106', 'SS-107', 'SP-108', 'PR-109', 'RE-110',
        'ER-111', 'RA-111.001', 'AT-112.002', 'TY-112.001', 'YE-112',
        'EQ-113', 'QC-114', 'CA-115', 'AY-116',
        'C.S-105', 'A.S-106', 'S.P-107', 'S.R-108', 'P.E-109', 'R.R-110',
        'E.A-111', 'R.T-111.001', 'A.Y-112.002', 'T.E-112.001', 'Y.Q-112',
        'E.C-113', 'Q.A-114', 'C.Y-115'
    }
    self.assertEqual(expected_long, encode("CASSPRERATYEQCAY"))
def create_dummy_sequencedataset(self, path):
    """Build a SequenceDataset of one alpha and two identical beta chain sequences.

    :param path: parent directory; the dataset is written to its "sequences" subdirectory
    :return: the result of SequenceDataset.build with the three sequences and a second argument of 2
    """
    alpha_sequence = ReceptorSequence(
        amino_acid_sequence="AAATTT",
        identifier="1a",
        metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                  custom_params={"d_call": "TRAD1", "custom1": "cust1"}))

    beta_sequences = []
    for identifier in ("1b", "2b"):
        beta_sequences.append(ReceptorSequence(
            amino_acid_sequence="ATATAT",
            identifier=identifier,
            metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                      custom_params={"d_call": "TRBD1", "custom2": "cust1"})))

    sequences_path = path / "sequences"
    PathBuilder.build(sequences_path)

    return SequenceDataset.build([alpha_sequence] + beta_sequences, 2, sequences_path)
def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
    """Build repertoires from lists of amino acid sequences and write a metadata file.

    :param sequences: list of lists of amino acid strings, one inner list per repertoire
    :param path: output directory; repertoires go to its "repertoires" subdirectory
    :param labels: optional dict mapping label name to a list with one value per repertoire
    :param seq_metadata: optional list (per repertoire) of lists (per sequence) of dicts
           that are unpacked into SequenceMetadata; default TRB metadata is used when omitted
    :param subject_ids: optional list with one subject id per repertoire; "rep_<index>" is
           used when omitted
    :return: tuple of (list of repertoires, path to "metadata.csv")
    """
    if subject_ids is not None:
        assert len(subject_ids) == len(sequences)

    if seq_metadata is not None:
        assert len(sequences) == len(seq_metadata)
        for sequence_list, metadata_list in zip(sequences, seq_metadata):
            assert len(sequence_list) == len(metadata_list)

    PathBuilder.build(path)
    rep_path = PathBuilder.build(path / "repertoires")

    if subject_ids is None:
        subject_ids = []

    default_metadata = dict(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                            j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                            count=1, chain="TRB", region_type="IMGT_CDR3")

    repertoires = []
    for rep_index, sequence_list in enumerate(sequences):
        rep_sequences = ReceptorSequenceList()

        # generate a subject id for this repertoire when none was supplied
        if len(subject_ids) < len(sequences):
            subject_ids.append("rep_" + str(rep_index))

        for seq_index, sequence in enumerate(sequence_list):
            if seq_metadata is None:
                sequence_metadata = SequenceMetadata(**default_metadata)
            else:
                sequence_metadata = SequenceMetadata(**seq_metadata[rep_index][seq_index])

            rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                  metadata=sequence_metadata,
                                                  identifier=str(seq_index)))

        if labels is None:
            metadata = {}
        else:
            metadata = {key: values[rep_index] for key, values in labels.items()}
        metadata["subject_id"] = subject_ids[rep_index]

        repertoires.append(Repertoire.build_from_sequence_objects(rep_sequences, rep_path, metadata,
                                                                  filename_base=f"rep_{rep_index}"))

    columns = {
        "filename": [repertoire.data_filename for repertoire in repertoires],
        "subject_id": subject_ids,
        "repertoire_identifier": [repertoire.identifier for repertoire in repertoires]
    }
    if labels is not None:
        columns.update(labels)

    metadata_path = path / "metadata.csv"
    pd.DataFrame(columns).to_csv(metadata_path, index=False)

    return repertoires, metadata_path
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Creates k-mers from a sequence via KmerHelper.create_kmers_from_sequence.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (where params.model["k"] is used)
    :return: the k-mers created by KmerHelper, or None (with a logged warning)
             if the sequence is shorter than k
    """
    k = params.model["k"]

    if len(sequence.get_sequence()) >= k:
        return KmerHelper.create_kmers_from_sequence(sequence, k)

    logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
    return None