def create_dummy_receptordataset(self, path):
    """
    Build a two-receptor ReceptorDataset fixture stored under ``path / "receptors"``.

    Every receptor is a TCABReceptor with one alpha and one beta ReceptorSequence. The sequence
    metadata carries V/J gene names, the chain, frame type "IN" and two custom parameters:
    "d_call" plus "custom1" (receptor "1") or "custom2" (receptor "2").

    Args:
        path: parent directory (must support the ``/`` operator); the "receptors" folder is created in it

    Returns:
        whatever ReceptorDataset.build_from_objects returns for the two receptors
    """

    def make_chain(amino_acids, sequence_id, chain, v_gene, j_gene, d_call, custom_key):
        # the metadata object is created first, exactly as when it is written inline as an argument
        chain_metadata = SequenceMetadata(v_gene=v_gene, j_gene=j_gene, chain=chain, frame_type="IN",
                                          custom_params={"d_call": d_call, custom_key: "cust1"})
        return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=sequence_id, metadata=chain_metadata)

    def make_alpha(amino_acids, sequence_id, custom_key):
        return make_chain(amino_acids, sequence_id, Chain.ALPHA, "TRAV1", "TRAJ1", "TRAD1", custom_key)

    def make_beta(amino_acids, sequence_id, custom_key):
        return make_chain(amino_acids, sequence_id, Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", custom_key)

    first_receptor = TCABReceptor(identifier="1",
                                  alpha=make_alpha("AAATTT", "1a", "custom1"),
                                  beta=make_beta("ATATAT", "1b", "custom1"))
    second_receptor = TCABReceptor(identifier="2",
                                   alpha=make_alpha("AAAAAA", "2a", "custom2"),
                                   beta=make_beta("AAAAAA", "2b", "custom2"))

    receptors_path = path / "receptors"
    PathBuilder.build(receptors_path)

    return ReceptorDataset.build_from_objects([first_receptor, second_receptor], 2, receptors_path)
def create_dataset(self, path, dataset_size: int = 50):
    """
    Pickle ``dataset_size`` TCABReceptor objects to ``path / "receptors.pkl"`` and wrap them in a ReceptorDataset.

    Receptors at even positions use the "ACACAC" sequence for both chains and carry label l1 = 1;
    receptors at odd positions use "DDDEEE" and carry l1 = 2. The identifier is the position as a string.

    Args:
        path: directory for the pickle file (built with PathBuilder.build)
        dataset_size: number of receptors to generate

    Returns:
        a ReceptorDataset with identifier "d1" that points at the single pickle file
    """
    even_chain = ReceptorSequence(amino_acid_sequence="ACACAC")
    odd_chain = ReceptorSequence(amino_acid_sequence="DDDEEE")

    receptors = []
    for index in range(dataset_size):
        # the same sequence object is deliberately reused for alpha and beta of a receptor
        chain, label_value = (odd_chain, 2) if index % 2 else (even_chain, 1)
        receptors.append(TCABReceptor(alpha=chain, beta=chain, metadata={"l1": label_value}, identifier=str(index)))

    PathBuilder.build(path)
    filename = path / "receptors.pkl"

    with open(filename, "wb") as pickle_file:
        pickle.dump(receptors, pickle_file)

    # NOTE(review): this label configuration is neither returned nor passed on; kept to leave behaviour untouched
    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    return ReceptorDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def test(self):
    """
    Encode four paired receptors with KmerFreqReceptorEncoder (continuous k-mers, k=3) and check the output.

    Receptors "1" and "3" are built from the same chain sequences, so their encoded rows are compared for
    equality; the test also checks the number of examples, the example ids and a few chain-prefixed
    feature names. The temporary directory is removed at the end.
    """
    chain_sequences = [("AAACCC", "AAACCC"), ("AAA", "CCC"), ("AAACCC", "AAACCC"), ("AAA", "CCC")]
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                              beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                              identifier=str(number))
                 for number, (alpha_aa, beta_aa) in enumerate(chain_sequences, start=1)]

    path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
    PathBuilder.build(path / 'data')
    dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    encoder = KmerFreqReceptorEncoder.build_object(dataset,
                                                   normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
                                                   reads=ReadsType.UNIQUE.name,
                                                   sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
                                                   sequence_type=SequenceType.AMINO_ACID.name,
                                                   k=3)

    encoder_params = EncoderParams(result_path=path / "2/", label_config=label_config, pool_size=2, learn_model=True,
                                   model={}, filename="dataset.csv", encode_labels=False)
    encoded_dataset = encoder.encode(dataset, encoder_params)

    self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])

    expected_ids = ['1', '2', '3', '4']
    self.assertTrue(all(identifier in encoded_dataset.encoded_data.example_ids for identifier in expected_ids))

    # receptors "1" and "3" share their sequences, so the dense rows must match
    first_row = encoded_dataset.encoded_data.examples[0].A
    third_row = encoded_dataset.encoded_data.examples[2].A
    self.assertTrue(numpy.array_equal(first_row, third_row))

    expected_features = ["alpha_AAA", "alpha_AAC", "beta_CCC"]
    self.assertTrue(all(feature_name in encoded_dataset.encoded_data.feature_names for feature_name in expected_features))

    shutil.rmtree(path)
def build_receptor_from_rows(first_row, second_row, identifier, chain_pair, metadata_columns):
    """
    Import two rows as sequences and combine them into the receptor class matching ``chain_pair``.

    The receptor-level metadata is a copy of the custom parameters of one chain: the second chain
    (beta / delta) for TRA_TRB and TRG_TRD, the first chain (heavy) for IGH_IGL and IGH_IGK.

    Args:
        first_row: row holding the first chain (alpha, gamma or heavy)
        second_row: row holding the second chain (beta, delta, light or kappa)
        identifier: identifier assigned to the receptor
        chain_pair: ChainPair member selecting the receptor class
        metadata_columns: forwarded to ImportHelper.import_sequence for both rows

    Returns:
        a TCABReceptor, TCGDReceptor, BCReceptor or BCKReceptor

    Raises:
        NotImplementedError: if ``chain_pair`` is none of the four supported pairs
    """
    # both rows are imported before the chain pair is checked
    chain_one = ImportHelper.import_sequence(first_row, metadata_columns=metadata_columns)
    chain_two = ImportHelper.import_sequence(second_row, metadata_columns=metadata_columns)

    if chain_pair == ChainPair.TRA_TRB:
        return TCABReceptor(alpha=chain_one, beta=chain_two, identifier=identifier,
                            metadata={**chain_two.metadata.custom_params})

    if chain_pair == ChainPair.TRG_TRD:
        return TCGDReceptor(gamma=chain_one, delta=chain_two, identifier=identifier,
                            metadata={**chain_two.metadata.custom_params})

    if chain_pair == ChainPair.IGH_IGL:
        return BCReceptor(heavy=chain_one, light=chain_two, identifier=identifier,
                          metadata={**chain_one.metadata.custom_params})

    if chain_pair == ChainPair.IGH_IGK:
        return BCKReceptor(heavy=chain_one, kappa=chain_two, identifier=identifier,
                           metadata={**chain_one.metadata.custom_params})

    raise NotImplementedError(f"ImportHelper: {chain_pair} chain pair is not supported.")
def _prepare_parameters(reference: dict, max_edit_distances: dict, name: str = None):
    """
    Validate and normalise the parameters of the MatchedReceptorsEncoder.

    Args:
        reference: reference specification, passed to MatchedReferenceUtil.prepare_reference (paired=True)
        max_edit_distances: either a single int applied to every legal chain, or a dict mapping chain
            names to distances; the dict keys are checked against the chains of TCABReceptor,
            TCGDReceptor and BCReceptor
        name: optional name, returned unchanged

    Returns:
        dict with the keys "reference_receptors", "max_edit_distances" (always a dict) and "name"

    Raises:
        whatever ParameterValidator raises for illegal keys or for a value that is neither int nor dict
    """
    location = "MatchedReceptorsEncoder"

    legal_chains = [chain
                    for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor())
                    for chain in receptor.get_chains()]

    # isinstance rather than an exact type comparison, so that dict subclasses (e.g. OrderedDict) also get
    # their keys validated; bool is excluded explicitly because it is a subclass of int
    if isinstance(max_edit_distances, int) and not isinstance(max_edit_distances, bool):
        max_edit_distances = {chain: max_edit_distances for chain in legal_chains}
    elif isinstance(max_edit_distances, dict):
        # NOTE(review): exclusive=False presumably allows specifying only a subset of the legal chains - confirm
        ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location, "max_edit_distances",
                                       exclusive=False)
    else:
        # any other type is reported through the validator
        ParameterValidator.assert_type_and_value(max_edit_distances, dict, location, 'max_edit_distances')

    reference_receptors = MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True)

    return {
        "reference_receptors": reference_receptors,
        "max_edit_distances": max_edit_distances,
        "name": name
    }
def build_object(cls, sequences: dict, identifier: str = None, metadata: dict = None) -> Receptor:
    """
    Build the receptor object that corresponds to the chains present in ``sequences``.

    Args:
        sequences: dict mapping chain values (Chain.<X>.value) to the sequence objects of the receptor
        identifier: identifier assigned to the receptor
        metadata: metadata dict assigned to the receptor

    Returns:
        a TCABReceptor, TCGDReceptor, BCReceptor or BCKReceptor, or None (after a warning) when the chains
        do not form one complete supported pair; this includes an empty dict and a dict with a single chain
    """

    def is_complete_pair(chain_pair, first_chain, second_chain) -> bool:
        # all(...) alone is only a subset test: an empty or single-chain dict would pass it and the
        # lookups below would fail with KeyError, so both chains are additionally required to be present
        return (all(chain in chain_pair.value for chain in sequences.keys())
                and first_chain.value in sequences
                and second_chain.value in sequences)

    if is_complete_pair(ChainPair.TRA_TRB, Chain.ALPHA, Chain.BETA):
        return TCABReceptor(alpha=sequences[Chain.ALPHA.value], beta=sequences[Chain.BETA.value],
                            identifier=identifier, metadata=metadata)

    if is_complete_pair(ChainPair.TRG_TRD, Chain.GAMMA, Chain.DELTA):
        return TCGDReceptor(gamma=sequences[Chain.GAMMA.value], delta=sequences[Chain.DELTA.value],
                            identifier=identifier, metadata=metadata)

    if is_complete_pair(ChainPair.IGH_IGL, Chain.HEAVY, Chain.LIGHT):
        return BCReceptor(heavy=sequences[Chain.HEAVY.value], light=sequences[Chain.LIGHT.value],
                          identifier=identifier, metadata=metadata)

    if is_complete_pair(ChainPair.IGH_IGK, Chain.HEAVY, Chain.KAPPA):
        return BCKReceptor(heavy=sequences[Chain.HEAVY.value], kappa=sequences[Chain.KAPPA.value],
                           identifier=identifier, metadata=metadata)

    warnings.warn(
        f"ReceptorBuilder: attempt to build_from_objects receptor with chains {sequences.keys()}, returning None..."
    )
    return None
def process_iris_row(row, paired: bool = False, all_dual_chains: bool = True, all_genes: bool = False):
    """
    Convert one IRIS row into receptor sequences or paired receptors.

    Args:
        row: mapping with the columns "Chain: TRA (1)", "Chain: TRB (1)", optionally the "(2)" variants,
            and "Clonotype ID" (read only when receptors are actually created)
        paired: if True, build TCABReceptor objects for every alpha/beta combination; otherwise collect
            the individual chains
        all_dual_chains: if True, the second ("(2)") alpha and beta chains are processed as well
        all_genes: forwarded to IRISSequenceImport.process_iris_chain

    Returns:
        a list of TCABReceptor objects when ``paired`` is True (empty if the first alpha or beta chain is
        missing), otherwise a ReceptorSequenceList with the sequences of all processed chains
    """
    if not paired:
        chain_sequences = ReceptorSequenceList()
        # process all dual chains if specified, otherwise just chain 1
        dual_chain_ids = [1, 2] if all_dual_chains else [1]
        for chain, dual_chain_id in it.product(["A", "B"], dual_chain_ids):
            if row[f"Chain: TR{chain} ({dual_chain_id})"] is not None:
                chain_sequences.extend(IRISSequenceImport.process_iris_chain(row, chain, dual_chain_id, all_genes))
        return chain_sequences

    receptors = []

    # a receptor needs at least the first alpha and the first beta chain
    if row["Chain: TRA (1)"] is None or row["Chain: TRB (1)"] is None:
        return receptors

    alpha_seqs = IRISSequenceImport.process_iris_chain(row, "A", 1, all_genes)
    beta_seqs = IRISSequenceImport.process_iris_chain(row, "B", 1, all_genes)

    if all_dual_chains:
        if row["Chain: TRA (2)"] is not None:
            alpha_seqs.extend(IRISSequenceImport.process_iris_chain(row, "A", 2, all_genes))
        if row["Chain: TRB (2)"] is not None:
            beta_seqs.extend(IRISSequenceImport.process_iris_chain(row, "B", 2, all_genes))

    # every alpha chain is combined with every beta chain
    for (alpha_i, alpha_seq), (beta_i, beta_seq) in it.product(enumerate(alpha_seqs), enumerate(beta_seqs)):
        clonotype_id = row["Clonotype ID"]
        receptors.append(TCABReceptor(alpha=alpha_seq, beta=beta_seq,
                                      identifier=f"{clonotype_id}-A{alpha_i}-B{beta_i}",
                                      metadata={"clonotype_id": clonotype_id}))

    return receptors
def _construct_test_dataset(self, path, dataset_size: int = 50):
    """
    Create a ReceptorDataset with two labelled TCABReceptor objects and the matching label configuration.

    Args:
        path: directory for the dataset (built with PathBuilder.build)
        dataset_size: not used by this method

    Returns:
        tuple (dataset, label configuration with label "l1" and values [1, 2])
    """
    # (alpha sequence, beta sequence, value of label "l1", receptor identifier)
    receptor_specs = [("AAAA", "ATA", 1, "1"),
                      ("ATA", "ATT", 2, "2")]

    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                              beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                              metadata={"l1": label_value},
                              identifier=receptor_id)
                 for alpha_aa, beta_aa, label_value, receptor_id in receptor_specs]

    PathBuilder.build(path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    return ReceptorDataset.build(receptors, 2, path), label_config
def construct_test_flatten_dataset(self, path):
    """
    Create a ReceptorDataset of three TCABReceptor objects, the last two having identical content.

    Args:
        path: location passed to ReceptorDataset.build

    Returns:
        the dataset returned by ReceptorDataset.build(receptors, 10, path)
    """

    def make_receptor(receptor_id, label_value, alpha_aa, beta_aa):
        # chain identifiers are the receptor identifier followed by "a" (alpha) or "b" (beta)
        return TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa, identifier=f"{receptor_id}a"),
                            beta=ReceptorSequence(amino_acid_sequence=beta_aa, identifier=f"{receptor_id}b"),
                            metadata={"l1": label_value},
                            identifier=receptor_id)

    # receptor "2" appears twice, as two separate objects
    receptors = [make_receptor("1", 1, "AAATTT", "ATATAT"),
                 make_receptor("2", 2, "AAAAAA", "AAAAAA"),
                 make_receptor("2", 2, "AAAAAA", "AAAAAA")]

    return ReceptorDataset.build(receptors, 10, path)
def test_split_dataset(self):
    """
    Check that LeaveOneOutSplitter splits ten receptors by the "subject" metadata field into three folds.

    The receptors get subject values 0, 1 and 2 (position modulo 3); for fold i every test receptor must
    have subject i and no train receptor may have it. The temporary directory is removed at the end.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "leave_one_out_splitter/")

    receptors = [TCABReceptor(ReceptorSequence(), ReceptorSequence(), {"subject": index % 3})
                 for index in range(10)]

    filename = path / "batch1.pickle"
    with open(filename, "wb") as pickle_file:
        pickle.dump(receptors, pickle_file)

    dataset = ReceptorDataset(filenames=[filename])

    split_config = SplitConfig(SplitType.LEAVE_ONE_OUT_STRATIFICATION, split_count=3,
                               leave_one_out_config=LeaveOneOutConfig("subject", 1))
    result_paths = [path / f"result_{fold}/" for fold in range(1, 4)]
    params = DataSplitterParams(dataset, SplitType.LEAVE_ONE_OUT_STRATIFICATION, 3, paths=result_paths,
                                split_config=split_config)

    train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(params)

    self.assertEqual(3, len(train_datasets))
    self.assertEqual(3, len(test_datasets))

    for subject in range(3):
        held_out = test_datasets[subject].get_data()
        self.assertTrue(all(receptor.metadata["subject"] == subject for receptor in held_out))

        remaining = train_datasets[subject].get_data()
        self.assertTrue(all(receptor.metadata["subject"] != subject for receptor in remaining))

    shutil.rmtree(path)
def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict,
                              path: Path):
    """
    Creates receptor_count receptors where the length of sequences in each chain is sampled independently for each
    sequence from chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from
    the distribution given in labels. In this case, labels are multi-class, so each receptor will get one class from
    each label. This means that negative classes for the labels should be included as well in the specification.
    Chain 1 and 2 in this case refer to the alpha and beta chain of a T-cell receptor.

    The receptors are stored as a single record array in ``path / "batch01.npy"``; each receptor additionally gets
    a "subject" metadata entry of the form "subj_<n>", with n starting at 1.

    An example of input parameters is given below:

        receptor_count: 100 # generate 100 TCABReceptor objects
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0

    Returns:
        a ReceptorDataset referring to the generated file, with the label names mapped to their possible classes
    """
    RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                     chain_2_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    def get_random_sequence(length_probabilities: dict, chain: str, cell_id) -> ReceptorSequence:
        # the length is drawn first and the residues second, so the order of random draws stays
        # reproducible under a fixed seed
        length = random.choices(list(length_probabilities.keys()), list(length_probabilities.values()))[0]
        amino_acids = "".join(random.choices(alphabet, k=length))
        return ReceptorSequence(amino_acids,
                                metadata=SequenceMetadata(count=1,
                                                          v_subgroup=chain + "V1", v_gene=chain + "V1-1", v_allele=chain + "V1-1*01",
                                                          j_subgroup=chain + "J1", j_gene=chain + "J1-1", j_allele=chain + "J1-1*01",
                                                          chain=chain, cell_id=cell_id))

    receptors = [TCABReceptor(alpha=get_random_sequence(chain_1_length_probabilities, "TRA", i),
                              beta=get_random_sequence(chain_2_length_probabilities, "TRB", i),
                              metadata={**{label: random.choices(list(label_dict.keys()), list(label_dict.values()), k=1)[0]
                                           for label, label_dict in labels.items()},
                                        "subject": f"subj_{i + 1}"})
                 for i in range(receptor_count)]

    filename = path / "batch01.npy"

    # np.rec is the public home of fromrecords; the np.core namespace is private and deprecated in NumPy 2.0
    receptor_matrix = np.rec.fromrecords([receptor.get_record() for receptor in receptors],
                                         names=TCABReceptor.get_record_names())
    np.save(str(filename), receptor_matrix, allow_pickle=False)

    return ReceptorDataset(labels={label: list(label_dict.keys()) for label, label_dict in labels.items()},
                           filenames=[filename], file_size=receptor_count,
                           element_class_name=type(receptors[0]).__name__ if receptors else None)