def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
    receptor_objs = [receptor for receptor in dataset.get_data()]
    sequences = [[getattr(obj, chain).get_sequence(self.sequence_type) for chain in obj.get_chains()]
                 for obj in receptor_objs]
    first_chain_seqs, second_chain_seqs = zip(*sequences)

    if any(seq is None for seq in first_chain_seqs) or any(seq is None for seq in second_chain_seqs):
        raise ValueError(f"{OneHotEncoder.__name__}: receptor dataset {dataset.name} (id: {dataset.identifier}) contains empty sequences for the "
                         f"specified sequence type {self.sequence_type.name.lower()}. Please check that the dataset is imported correctly.")

    max_seq_len = max(max([len(seq) for seq in first_chain_seqs]), max([len(seq) for seq in second_chain_seqs]))

    example_ids = dataset.get_example_ids()
    labels = self._get_labels(receptor_objs, params) if params.encode_labels else None

    examples_first_chain = self._encode_sequence_list(first_chain_seqs, pad_n_sequences=len(receptor_objs),
                                                      pad_sequence_len=max_seq_len)
    examples_second_chain = self._encode_sequence_list(second_chain_seqs, pad_n_sequences=len(receptor_objs),
                                                       pad_sequence_len=max_seq_len)

    examples = np.stack((examples_first_chain, examples_second_chain), axis=1)

    feature_names = self._get_feature_names(max_seq_len, receptor_objs[0].get_chains())

    if self.flatten:
        examples = examples.reshape((len(receptor_objs), 2 * max_seq_len * len(self.onehot_dimensions)))
        feature_names = [item for sublist in feature_names for subsublist in sublist for item in subsublist]

    encoded_data = EncodedData(examples=examples, labels=labels, example_ids=example_ids,
                               feature_names=feature_names, encoding=OneHotEncoder.__name__,
                               info={"chain_names": receptor_objs[0].get_chains()
                                     if all(receptor_obj.get_chains() == receptor_objs[0].get_chains() for receptor_obj in receptor_objs)
                                     else None})

    return encoded_data
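
# Minimal standalone sketch (toy shapes, not immuneML's API) of the array logic in
# _encode_data above: one one-hot array per chain is stacked along a new chain axis,
# then optionally flattened into a single feature vector per receptor.
import numpy as np

n_receptors, max_seq_len, n_onehot_dims = 4, 6, 20  # assumed toy dimensions

examples_first_chain = np.zeros((n_receptors, max_seq_len, n_onehot_dims))
examples_second_chain = np.zeros((n_receptors, max_seq_len, n_onehot_dims))

examples = np.stack((examples_first_chain, examples_second_chain), axis=1)
assert examples.shape == (4, 2, 6, 20)  # receptor x chain x position x one-hot dimension

flattened = examples.reshape((n_receptors, 2 * max_seq_len * n_onehot_dims))
assert flattened.shape == (4, 240)  # one flat feature vector per receptor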

def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
    receptor_objs = [receptor for receptor in dataset.get_data()]
    sequences = [[getattr(obj, chain).get_sequence() for chain in obj.get_chains()] for obj in receptor_objs]
    first_chain_seqs, second_chain_seqs = zip(*sequences)

    max_seq_len = max(max([len(seq) for seq in first_chain_seqs]), max([len(seq) for seq in second_chain_seqs]))

    example_ids = dataset.get_example_ids()
    labels = self._get_labels(receptor_objs, params) if params.encode_labels else None

    examples_first_chain = self._encode_sequence_list(first_chain_seqs, pad_n_sequences=len(receptor_objs),
                                                      pad_sequence_len=max_seq_len)
    examples_second_chain = self._encode_sequence_list(second_chain_seqs, pad_n_sequences=len(receptor_objs),
                                                       pad_sequence_len=max_seq_len)

    examples = np.stack((examples_first_chain, examples_second_chain), axis=1)

    feature_names = self._get_feature_names(max_seq_len, receptor_objs[0].get_chains())

    if self.flatten:
        examples = examples.reshape((len(receptor_objs), 2 * max_seq_len * len(self.onehot_dimensions)))
        feature_names = [item for sublist in feature_names for subsublist in sublist for item in subsublist]

    encoded_data = EncodedData(examples=examples, labels=labels, example_ids=example_ids,
                               feature_names=feature_names, encoding=OneHotEncoder.__name__,
                               info={"chain_names": receptor_objs[0].get_chains()
                                     if all(receptor_obj.get_chains() == receptor_objs[0].get_chains() for receptor_obj in receptor_objs)
                                     else None})

    return encoded_data

def _encode_examples(self, dataset: ReceptorDataset, params: EncoderParams):
    encoded_receptors_counts, encoded_receptors = [], []
    receptor_ids = []
    label_config = params.label_config
    labels = {label: [] for label in label_config.get_labels_by_name()} if params.encode_labels else None

    chains = []
    sequence_encoder = self._prepare_sequence_encoder()
    feature_names = sequence_encoder.get_feature_names(params)

    for receptor in dataset.get_data(params.pool_size):
        counts = {chain: Counter() for chain in receptor.get_chains()}
        chains = receptor.get_chains()
        for chain in receptor.get_chains():
            counts[chain] = self._encode_sequence(receptor.get_chain(chain), params, sequence_encoder, counts[chain])
        encoded_receptors_counts.append(counts)

        receptor_ids.append(receptor.identifier)

        if params.encode_labels:
            for label_name in label_config.get_labels_by_name():
                label = receptor.metadata[label_name]
                labels[label_name].append(label)

    for encoded_receptor_count in encoded_receptors_counts:
        counts = [self._add_chain_to_name(encoded_receptor_count[chain], chain) for chain in chains]
        encoded_receptors.append(counts[0] + counts[1])

    return encoded_receptors, receptor_ids, labels, feature_names

def import_sequence_dataset(import_class, params, dataset_name: str):
    PathBuilder.build(params.result_path)

    filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

    file_index = 0
    dataset_filenames = []
    dataset_params = {}
    items = None

    for index, filename in enumerate(filenames):
        new_items = ImportHelper.import_items(import_class, filename, params)
        items = np.append(items, new_items) if items is not None else new_items
        dataset_params = ImportHelper.extract_sequence_dataset_params(items, params)

        while len(items) > params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
            dataset_filenames.append(params.result_path / "batch_{}.pickle".format(file_index))
            ImportHelper.store_sequence_items(dataset_filenames, items, params.sequence_file_size)
            items = items[params.sequence_file_size:]
            file_index += 1

    init_kwargs = {"filenames": dataset_filenames, "file_size": params.sequence_file_size,
                   "name": dataset_name, "labels": dataset_params}
    dataset = ReceptorDataset(**init_kwargs) if params.paired else SequenceDataset(**init_kwargs)

    PickleExporter.export(dataset, params.result_path)

    return dataset
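
# Standalone sketch (hypothetical helper, not part of immuneML) of the batching pattern
# used in import_sequence_dataset above: accumulated items are flushed in fixed-size
# chunks, with a trailing partial chunk written once the last input file has been read.
def split_into_batches(items: list, batch_size: int) -> list:
    batches = []
    while len(items) > 0:
        batches.append(items[:batch_size])
        items = items[batch_size:]
    return batches

assert split_into_batches(list(range(7)), 3) == [[0, 1, 2], [3, 4, 5], [6]]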

def _create_dummy_data(self, path, dataset_type):
    PathBuilder.build(path)
    dataset = None

    test_repertoire = Repertoire.build(sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
                                       v_genes=["V1-1" for i in range(5)],
                                       j_genes=["J1-1" for i in range(5)],
                                       chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
                                       custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                                                     "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]},
                                       cell_ids=["1", "1", "1", "2", "2"],
                                       path=path)

    if dataset_type == "receptor":
        dataset = ReceptorDataset.build_from_objects(test_repertoire.receptors, 100, path, name="receptor_dataset")
        dataset.identifier = "receptor_dataset"
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    return dataset

def create_dataset(self, path, dataset_size: int = 50):
    receptors = []
    seq1 = ReceptorSequence(amino_acid_sequence="ACACAC")
    seq2 = ReceptorSequence(amino_acid_sequence="DDDEEE")

    for i in range(dataset_size):
        if i % 2 == 0:
            receptors.append(TCABReceptor(alpha=seq1, beta=seq1, metadata={"l1": 1}, identifier=str(i)))
        else:
            receptors.append(TCABReceptor(alpha=seq2, beta=seq2, metadata={"l1": 2}, identifier=str(i)))

    PathBuilder.build(path)
    filename = path / "receptors.pkl"
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    dataset = ReceptorDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset

def create_dummy_receptordataset(self, path):
    receptors = [TCABReceptor(identifier="1",
                              alpha=ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a",
                                                     metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA,
                                                                               frame_type="IN",
                                                                               custom_params={"d_call": "TRAD1", "custom1": "cust1"})),
                              beta=ReceptorSequence(amino_acid_sequence="ATATAT", identifier="1b",
                                                    metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA,
                                                                              frame_type="IN",
                                                                              custom_params={"d_call": "TRBD1", "custom1": "cust1"}))),
                 TCABReceptor(identifier="2",
                              alpha=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2a",
                                                     metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA,
                                                                               frame_type="IN",
                                                                               custom_params={"d_call": "TRAD1", "custom2": "cust1"})),
                              beta=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2b",
                                                    metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA,
                                                                              frame_type="IN",
                                                                              custom_params={"d_call": "TRBD1", "custom2": "cust1"})))]

    receptors_path = path / "receptors"
    PathBuilder.build(receptors_path)

    return ReceptorDataset.build_from_objects(receptors, 2, receptors_path)

def _create_dummy_data(self, path, dataset_type):
    PathBuilder.build(path)
    dataset = None

    test_repertoire = Repertoire.build(sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
                                       v_genes=["V1-1" for i in range(5)],
                                       j_genes=["J1-1" for i in range(5)],
                                       chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
                                       custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                                                     "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]},
                                       cell_ids=[1, 1, 1, 2, 2],
                                       path=path)

    if dataset_type == "receptor":
        receptordataset_filename = path / "receptors.pkl"
        with open(receptordataset_filename, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)

        dataset = ReceptorDataset(filenames=[receptordataset_filename], identifier="receptor_dataset")
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    return dataset

def _encode_new_dataset(self, dataset, params: EncoderParams):
    encoded_data = self._encode_data(dataset, params)
    encoded_dataset = ReceptorDataset(filenames=dataset.get_filenames(), encoded_data=encoded_data,
                                      labels=dataset.labels)
    return encoded_dataset

def test(self):
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                              beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="1"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                              beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="2"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                              beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="3"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                              beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="4")]

    path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
    PathBuilder.build(path / 'data')
    dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    encoder = KmerFreqReceptorEncoder.build_object(dataset, **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3
    })

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path / "2/", label_config=lc, pool_size=2,
                                                            learn_model=True, model={}, filename="dataset.csv",
                                                            encode_labels=False))

    self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])
    self.assertTrue(all(identifier in encoded_dataset.encoded_data.example_ids for identifier in ['1', '2', '3', '4']))
    self.assertTrue(numpy.array_equal(encoded_dataset.encoded_data.examples[0].A, encoded_dataset.encoded_data.examples[2].A))
    self.assertTrue(all(feature_name in encoded_dataset.encoded_data.feature_names
                        for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]))

    shutil.rmtree(path)

def _build_labels(self, dataset: ReceptorDataset, params: EncoderParams) -> dict:
    labels = {label: [] for label in params.label_config.get_labels_by_name()}

    for receptor in dataset.get_data():
        for label in labels.keys():
            labels[label].append(receptor.metadata[label])

    return labels

def _implant_signals_in_receptors(simulation_state: SimulationState) -> Dataset:
    processed_receptors = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_receptor)
    processed_dataset = ReceptorDataset.build(receptors=processed_receptors,
                                              file_size=simulation_state.dataset.file_size,
                                              name=simulation_state.dataset.name,
                                              path=simulation_state.result_path)
    processed_dataset.labels = {**(simulation_state.dataset.labels if simulation_state.dataset.labels is not None else {}),
                                **{signal: [True, False] for signal in simulation_state.signals}}
    return processed_dataset

def _construct_test_dataset(self, path, dataset_size: int = 50):
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                              beta=ReceptorSequence(amino_acid_sequence="ATA"),
                              metadata={"l1": 1}, identifier="1"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                              beta=ReceptorSequence(amino_acid_sequence="ATT"),
                              metadata={"l1": 2}, identifier="2")]

    PathBuilder.build(path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    dataset = ReceptorDataset.build(receptors, 2, path)
    return dataset, lc

def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
    elements = []

    for file in filenames:
        df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)
        # dropna/drop_duplicates return new DataFrames, so the results must be assigned
        df = df.dropna()
        df = df.drop_duplicates()
        df.rename(columns=generic_params.column_mapping, inplace=True)

        # trim the junction down to the CDR3: 1 amino acid / 3 nucleotides on each end
        if "alpha_amino_acid_sequence" in df:
            df["alpha_amino_acid_sequence"] = df["alpha_amino_acid_sequence"].str[1:-1]
        if "beta_amino_acid_sequence" in df:
            df["beta_amino_acid_sequence"] = df["beta_amino_acid_sequence"].str[1:-1]
        if "alpha_nucleotide_sequence" in df:
            df["alpha_nucleotide_sequence"] = df["alpha_nucleotide_sequence"].str[3:-3]
        if "beta_nucleotide_sequence" in df:
            df["beta_nucleotide_sequence"] = df["beta_nucleotide_sequence"].str[3:-3]

        chain_vals = [ch for ch in generic_params.receptor_chains.value]
        chain_names = [Chain.get_chain(ch).name.lower() for ch in generic_params.receptor_chains.value]

        for chain_name in chain_names:
            df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

        for index, row in df.iterrows():
            sequences = {chain_vals[i]: ReceptorSequence(
                amino_acid_sequence=row[chain_name + "_amino_acid_sequence"] if chain_name + "_amino_acid_sequence" in row else None,
                nucleotide_sequence=row[chain_name + "_nucleotide_sequence"] if chain_name + "_nucleotide_sequence" in row else None,
                metadata=SequenceMetadata(v_gene=row[f"{chain_name}_v_gene"], v_allele=row[f"{chain_name}_v_allele"],
                                          v_subgroup=row[f"{chain_name}_v_subgroup"],
                                          j_gene=row[f"{chain_name}_j_gene"], j_allele=row[f"{chain_name}_j_allele"],
                                          j_subgroup=row[f"{chain_name}_j_subgroup"],
                                          chain=chain_name, count=row["count"],
                                          region_type=generic_params.region_type.value))
                for i, chain_name in enumerate(chain_names)}

            elements.append(ReceptorBuilder.build_object(sequences, row["identifier"],
                                                         {key: row[key] for key in row.keys()
                                                          if all(item not in key for item in
                                                                 ["v_gene", "j_gene", "count", "identifier"] + chain_names)}))

    return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)
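
# Standalone sketch (made-up sequence) of the junction-to-CDR3 trimming applied in
# _import_from_files above: the IMGT junction carries one conserved residue on each
# end (cysteine ... phenylalanine/tryptophan), i.e. 1 amino acid or 3 nucleotides per side.
import pandas as pd

df = pd.DataFrame({"alpha_amino_acid_sequence": ["CAVRDSNYQLIW"]})
df["alpha_amino_acid_sequence"] = df["alpha_amino_acid_sequence"].str[1:-1]
assert df["alpha_amino_acid_sequence"].iloc[0] == "AVRDSNYQLI"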

def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
    iris_params = IRISImportParams.build_object(**params)

    filenames = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
    file_index = 0
    dataset_filenames = []

    for index, filename in enumerate(filenames):
        items = IRISSequenceImport.import_items(filename, paired=iris_params.paired,
                                                all_dual_chains=iris_params.import_dual_chains,
                                                all_genes=iris_params.import_all_gene_combinations)

        while len(items) > iris_params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
            dataset_filenames.append(iris_params.result_path / "batch_{}.pickle".format(file_index))
            ImportHelper.store_sequence_items(dataset_filenames, items, iris_params.sequence_file_size)
            items = items[iris_params.sequence_file_size:]
            file_index += 1

    return ReceptorDataset(filenames=dataset_filenames, file_size=iris_params.sequence_file_size, name=dataset_name) \
        if iris_params.paired \
        else SequenceDataset(filenames=dataset_filenames, file_size=iris_params.sequence_file_size, name=dataset_name)

def test_split_dataset(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "leave_one_out_splitter/")

    receptors = []
    for i in range(10):
        receptors.append(TCABReceptor(ReceptorSequence(), ReceptorSequence(), {"subject": i % 3}))

    filename = path / "batch1.pickle"
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    dataset = ReceptorDataset(filenames=[filename])

    params = DataSplitterParams(dataset, SplitType.LEAVE_ONE_OUT_STRATIFICATION, 3,
                                paths=[path / f"result_{i}/" for i in range(1, 4)],
                                split_config=SplitConfig(SplitType.LEAVE_ONE_OUT_STRATIFICATION, split_count=3,
                                                         leave_one_out_config=LeaveOneOutConfig("subject", 1)))
    train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(params)

    self.assertEqual(3, len(train_datasets))
    self.assertEqual(3, len(test_datasets))

    for i in range(3):
        self.assertTrue(all(receptor.metadata["subject"] == i for receptor in test_datasets[i].get_data()))
        self.assertTrue(all(receptor.metadata["subject"] != i for receptor in train_datasets[i].get_data()))

    shutil.rmtree(path)

def construct_test_flatten_dataset(self, path):
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a"),
                              beta=ReceptorSequence(amino_acid_sequence="ATATAT", identifier="1b"),
                              metadata={"l1": 1}, identifier="1"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2a"),
                              beta=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2b"),
                              metadata={"l1": 2}, identifier="2"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2a"),
                              beta=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2b"),
                              metadata={"l1": 2}, identifier="2")]

    return ReceptorDataset.build(receptors, 10, path)

def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict,
                              chain_2_length_probabilities: dict, labels: dict, path: Path):
    """
    Creates receptor_count receptors where the length of the sequence in each chain is sampled independently for each
    receptor from the chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from
    the distribution given in labels. In this case, labels are multi-class, so each receptor will get one class from each
    label. This means that negative classes for the labels should be included in the specification as well. Chains 1 and 2
    here refer to the alpha and beta chain of a T-cell receptor.

    An example of the input parameters is given below:

    receptor_count: 100 # generate 100 TRABReceptors
    chain_1_length_probabilities:
        14: 0.8 # 80% of all generated sequences across all receptors (for chain 1) will have length 14
        15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
    chain_2_length_probabilities:
        14: 0.8 # 80% of all generated sequences across all receptors (for chain 2) will have length 14
        15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
    labels:
        epitope1: # label name
            True: 0.5 # 50% of the receptors will have class True
            False: 0.5 # 50% of the receptors will have class False
        epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
            1: 0.3 # 30% of the generated receptors will have class 1
            0: 0.7 # 70% of the generated receptors will have class 0
    """
    RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                     chain_2_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    get_random_sequence = lambda proba, chain, id: ReceptorSequence(
        "".join(random.choices(alphabet, k=random.choices(list(proba.keys()), proba.values())[0])),
        metadata=SequenceMetadata(count=1,
                                  v_subgroup=chain + "V1", v_gene=chain + "V1-1", v_allele=chain + "V1-1*01",
                                  j_subgroup=chain + "J1", j_gene=chain + "J1-1", j_allele=chain + "J1-1*01",
                                  chain=chain, cell_id=id))

    receptors = [TCABReceptor(alpha=get_random_sequence(chain_1_length_probabilities, "TRA", i),
                              beta=get_random_sequence(chain_2_length_probabilities, "TRB", i),
                              metadata={**{label: random.choices(list(label_dict.keys()), label_dict.values(), k=1)[0]
                                           for label, label_dict in labels.items()},
                                        **{"subject": f"subj_{i + 1}"}})
                 for i in range(receptor_count)]

    filename = path / "batch01.npy"

    receptor_matrix = np.core.records.fromrecords([receptor.get_record() for receptor in receptors],
                                                  names=TCABReceptor.get_record_names())
    np.save(str(filename), receptor_matrix, allow_pickle=False)

    return ReceptorDataset(labels={label: list(label_dict.keys()) for label, label_dict in labels.items()},
                           filenames=[filename], file_size=receptor_count,
                           element_class_name=type(receptors[0]).__name__ if len(receptors) > 0 else None)
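
# Hypothetical usage sketch mirroring the parameter example in the docstring above;
# the output path is an assumed location, and any pathlib.Path works in its place.
from pathlib import Path

example_dataset = RandomDatasetGenerator.generate_receptor_dataset(
    receptor_count=100,
    chain_1_length_probabilities={14: 0.8, 15: 0.2},
    chain_2_length_probabilities={14: 0.8, 15: 0.2},
    labels={"epitope1": {True: 0.5, False: 0.5},
            "epitope2": {1: 0.3, 0: 0.7}},
    path=Path("./random_receptor_dataset"))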

def prepare_tcr_dist_dataframe(dataset: ReceptorDataset, label_names: list) -> pd.DataFrame:
    if len(label_names) > 1:
        raise NotImplementedError(f"TCRdist: multiple labels specified ({str(label_names)[1:-1]}), but only single label binary class "
                                  f"is currently supported in immuneML.")
    label_name = label_names[0]

    subject, epitope, count, v_a_gene, j_a_gene, cdr3_a_aa, v_b_gene, j_b_gene, cdr3_b_aa, clone_id, cdr3_b_nucseq, cdr3_a_nucseq = \
        [], [], [], [], [], [], [], [], [], [], [], []

    for receptor in dataset.get_data():
        subject.append(receptor.metadata["subject"] if "subject" in receptor.metadata else "sub" + receptor.identifier)
        epitope.append(receptor.metadata[label_name])
        # use the shared clone count only if both chains agree on it and it is defined; otherwise fall back to 1
        count.append(receptor.get_chain("alpha").metadata.count
                     if receptor.get_chain("alpha").metadata.count == receptor.get_chain("beta").metadata.count
                     and receptor.get_chain("beta").metadata.count is not None else 1)

        v_a_gene.append(TCRdistHelper.add_default_allele_to_v_gene(receptor.get_chain('alpha').metadata.v_allele))
        j_a_gene.append(receptor.get_chain('alpha').metadata.j_allele)
        cdr3_a_aa.append(receptor.get_chain('alpha').amino_acid_sequence)
        cdr3_a_nucseq.append(receptor.get_chain("alpha").nucleotide_sequence)

        v_b_gene.append(TCRdistHelper.add_default_allele_to_v_gene(receptor.get_chain('beta').metadata.v_allele))
        j_b_gene.append(receptor.get_chain('beta').metadata.j_allele)
        cdr3_b_aa.append(receptor.get_chain('beta').amino_acid_sequence)
        cdr3_b_nucseq.append(receptor.get_chain("beta").nucleotide_sequence)

        clone_id.append(receptor.identifier)

    if all(item is not None for item in cdr3_a_nucseq) and all(item is not None for item in cdr3_b_nucseq):
        return pd.DataFrame({"subject": subject, "epitope": epitope, "count": count,
                             "v_a_gene": v_a_gene, "j_a_gene": j_a_gene, "cdr3_a_aa": cdr3_a_aa,
                             "v_b_gene": v_b_gene, "j_b_gene": j_b_gene, "cdr3_b_aa": cdr3_b_aa,
                             "clone_id": clone_id, "cdr3_b_nucseq": cdr3_b_nucseq, "cdr3_a_nucseq": cdr3_a_nucseq})
    else:
        # nucleotide sequences are optional: omit the nucseq columns if any receptor lacks them
        return pd.DataFrame({"subject": subject, "epitope": epitope, "count": count,
                             "v_a_gene": v_a_gene, "j_a_gene": j_a_gene, "cdr3_a_aa": cdr3_a_aa,
                             "v_b_gene": v_b_gene, "j_b_gene": j_b_gene, "cdr3_b_aa": cdr3_b_aa,
                             "clone_id": clone_id})