예제 #1
0
    def create_dummy_receptordataset(self, path):
        """
        Build a ReceptorDataset of two TCABReceptor objects stored under ``path / "receptors"``.

        Every chain carries V/J gene names, a chain type, frame type "IN" and two custom parameters
        ("d_call" plus "custom1" for the first receptor or "custom2" for the second receptor).

        Args:
            path: parent location; it is combined with "receptors" through the ``/`` operator

        Returns:
            the value returned by ReceptorDataset.build_from_objects, called with the two receptors,
            2 as the second positional argument and the "receptors" directory
        """

        def annotated_sequence(amino_acids, sequence_id, v_gene, j_gene, chain, custom_params):
            # one chain: amino acid sequence + identifier + gene/chain annotation
            annotation = SequenceMetadata(v_gene=v_gene, j_gene=j_gene, chain=chain, frame_type="IN",
                                          custom_params=custom_params)
            return ReceptorSequence(amino_acid_sequence=amino_acids, identifier=sequence_id, metadata=annotation)

        def alpha_chain(amino_acids, sequence_id, custom_key):
            return annotated_sequence(amino_acids, sequence_id, "TRAV1", "TRAJ1", Chain.ALPHA,
                                      {"d_call": "TRAD1", custom_key: "cust1"})

        def beta_chain(amino_acids, sequence_id, custom_key):
            return annotated_sequence(amino_acids, sequence_id, "TRBV1", "TRBJ1", Chain.BETA,
                                      {"d_call": "TRBD1", custom_key: "cust1"})

        first_receptor = TCABReceptor(identifier="1",
                                      alpha=alpha_chain("AAATTT", "1a", "custom1"),
                                      beta=beta_chain("ATATAT", "1b", "custom1"))
        second_receptor = TCABReceptor(identifier="2",
                                       alpha=alpha_chain("AAAAAA", "2a", "custom2"),
                                       beta=beta_chain("AAAAAA", "2b", "custom2"))

        receptors_path = path / "receptors"
        PathBuilder.build(receptors_path)
        return ReceptorDataset.build_from_objects([first_receptor, second_receptor], 2, receptors_path)
    def create_dataset(self, path, dataset_size: int = 50):
        """
        Pickle ``dataset_size`` TCABReceptor objects to ``path / "receptors.pkl"`` and wrap the file in a ReceptorDataset.

        Receptors at even positions use the sequence "ACACAC" for both chains and the label value l1=1;
        receptors at odd positions use "DDDEEE" and l1=2. The identifier of each receptor is its
        position converted to a string.

        Args:
            path: directory that is created through PathBuilder.build and receives the pickle file
            dataset_size: number of receptors to generate

        Returns:
            ReceptorDataset with identifier "d1", labels {"l1": [1, 2]} and the pickle file as its only file
        """
        even_sequence = ReceptorSequence(amino_acid_sequence="ACACAC")
        odd_sequence = ReceptorSequence(amino_acid_sequence="DDDEEE")

        receptors = []
        for index in range(dataset_size):
            # the same sequence object is used for both chains of a receptor
            if index % 2 == 0:
                sequence, label_value = even_sequence, 1
            else:
                sequence, label_value = odd_sequence, 2
            receptors.append(TCABReceptor(alpha=sequence,
                                          beta=sequence,
                                          metadata={"l1": label_value},
                                          identifier=str(index)))

        PathBuilder.build(path)
        filename = path / "receptors.pkl"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        return ReceptorDataset(labels={"l1": [1, 2]},
                               filenames=[filename],
                               identifier="d1")
    def test(self):
        """
        Encode four paired receptors with KmerFreqReceptorEncoder (k=3, continuous k-mers, relative frequency)
        and check the encoded dataset.

        Expectations checked below:
            - there is one encoded example per receptor and all receptor identifiers are present,
            - receptors "1" and "3" (identical chains) get identical encodings,
            - feature names are prefixed by chain, e.g. "alpha_AAA" and "beta_CCC".

        The temporary directory is removed in a ``finally`` clause so that a failing assertion or an
        exception in the encoder does not leave files behind for later test runs.
        """
        chain_sequences = [("1", "AAACCC", "AAACCC"),
                           ("2", "AAA", "CCC"),
                           ("3", "AAACCC", "AAACCC"),
                           ("4", "AAA", "CCC")]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                         beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                         identifier=receptor_id)
            for receptor_id, alpha_aa, beta_aa in chain_sequences
        ]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path / 'data')

        try:
            dataset = ReceptorDataset.build_from_objects(receptors,
                                                         path=path,
                                                         file_size=10)

            lc = LabelConfiguration()
            lc.add_label("l1", [1, 2])

            encoder = KmerFreqReceptorEncoder.build_object(
                dataset, **{
                    "normalization_type":
                    NormalizationType.RELATIVE_FREQUENCY.name,
                    "reads": ReadsType.UNIQUE.name,
                    "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                    "sequence_type": SequenceType.AMINO_ACID.name,
                    "k": 3
                })

            encoded_dataset = encoder.encode(
                dataset,
                EncoderParams(result_path=path / "2/",
                              label_config=lc,
                              pool_size=2,
                              learn_model=True,
                              model={},
                              filename="dataset.csv",
                              encode_labels=False))

            encoded_data = encoded_dataset.encoded_data

            self.assertEqual(4, encoded_data.examples.shape[0])
            self.assertTrue(
                all(identifier in encoded_data.example_ids
                    for identifier in ['1', '2', '3', '4']))
            # receptors "1" and "3" have the same chains, so their encoded rows must match
            self.assertTrue(
                numpy.array_equal(encoded_data.examples[0].A,
                                  encoded_data.examples[2].A))
            self.assertTrue(
                all(feature_name in encoded_data.feature_names
                    for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]))
        finally:
            # clean up even when one of the checks above fails
            shutil.rmtree(path)
예제 #4
0
    def build_receptor_from_rows(first_row, second_row, identifier, chain_pair, metadata_columns):
        """
        Import one sequence from each row and combine the two into a paired receptor object.

        The receptor class and the source of the receptor-level metadata depend on ``chain_pair``:
            - ChainPair.TRA_TRB -> TCABReceptor, metadata copied from the second sequence's custom_params
            - ChainPair.TRG_TRD -> TCGDReceptor, metadata copied from the second sequence's custom_params
            - ChainPair.IGH_IGL -> BCReceptor, metadata copied from the first sequence's custom_params
            - ChainPair.IGH_IGK -> BCKReceptor, metadata copied from the first sequence's custom_params

        Args:
            first_row: row for the first chain (alpha / gamma / heavy)
            second_row: row for the second chain (beta / delta / light / kappa)
            identifier: identifier passed to the receptor
            chain_pair: ChainPair value selecting the receptor type
            metadata_columns: forwarded to ImportHelper.import_sequence for both rows

        Returns:
            the constructed receptor object

        Raises:
            NotImplementedError: if ``chain_pair`` is none of the four supported pairs
        """
        # both sequences are imported before the chain pair is inspected
        first_sequence = ImportHelper.import_sequence(first_row, metadata_columns=metadata_columns)
        second_sequence = ImportHelper.import_sequence(second_row, metadata_columns=metadata_columns)

        if chain_pair == ChainPair.TRA_TRB:
            receptor_metadata = {**second_sequence.metadata.custom_params}
            return TCABReceptor(alpha=first_sequence, beta=second_sequence,
                                identifier=identifier, metadata=receptor_metadata)

        if chain_pair == ChainPair.TRG_TRD:
            receptor_metadata = {**second_sequence.metadata.custom_params}
            return TCGDReceptor(gamma=first_sequence, delta=second_sequence,
                                identifier=identifier, metadata=receptor_metadata)

        if chain_pair == ChainPair.IGH_IGL:
            receptor_metadata = {**first_sequence.metadata.custom_params}
            return BCReceptor(heavy=first_sequence, light=second_sequence,
                              identifier=identifier, metadata=receptor_metadata)

        if chain_pair == ChainPair.IGH_IGK:
            receptor_metadata = {**first_sequence.metadata.custom_params}
            return BCKReceptor(heavy=first_sequence, kappa=second_sequence,
                               identifier=identifier, metadata=receptor_metadata)

        raise NotImplementedError(f"ImportHelper: {chain_pair} chain pair is not supported.")
예제 #5
0
    def _prepare_parameters(reference: dict,
                            max_edit_distances: dict,
                            name: str = None):
        """
        Validate and normalise the parameters of the MatchedReceptorsEncoder.

        Args:
            reference: reference specification, forwarded to MatchedReferenceUtil.prepare_reference with paired=True
            max_edit_distances: either an int, which is applied to every legal chain, or a dict whose keys are
                checked against the legal chains; any other type is handed to
                ParameterValidator.assert_type_and_value with ``dict`` as the expected type
            name: returned unchanged under the key "name"

        Returns:
            dict with the keys "reference_receptors", "max_edit_distances" and "name"
        """
        location = "MatchedReceptorsEncoder"

        # legal chains are collected from the chains of the T-cell (alpha/beta, gamma/delta) and B-cell receptor classes
        legal_chains = []
        for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor()):
            legal_chains.extend(receptor.get_chains())

        # exact type checks: only plain int and plain dict take the first two branches
        distances_type = type(max_edit_distances)
        if distances_type is int:
            max_edit_distances = dict.fromkeys(legal_chains, max_edit_distances)
        elif distances_type is dict:
            ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location,
                                           "max_edit_distances", exclusive=False)
        else:
            ParameterValidator.assert_type_and_value(max_edit_distances, dict, location, 'max_edit_distances')

        prepared = {
            "reference_receptors": MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True),
            "max_edit_distances": max_edit_distances,
            "name": name
        }
        return prepared
예제 #6
0
 def build_object(cls,
                  sequences: dict,
                  identifier: str = None,
                  metadata: dict = None) -> Receptor:
     """
     Build the receptor object that corresponds to the chains present in ``sequences``.

     A receptor type is selected when every key of ``sequences`` belongs to the value of the matching
     ChainPair member and both chains needed by that receptor are present. The pairs are tried in the
     order TRA_TRB, TRG_TRD, IGH_IGL, IGH_IGK.

     Args:
         sequences: mapping from chain value (e.g. ``Chain.ALPHA.value``) to the sequence object of that chain
         identifier: passed to the receptor constructor unchanged
         metadata: passed to the receptor constructor unchanged

     Returns:
         a TCABReceptor, TCGDReceptor, BCReceptor or BCKReceptor; None (after a warning) when ``sequences``
         is empty, lacks one chain of a pair, or contains an unsupported combination of chains
     """

     def has_pair(chain_pair, first_chain, second_chain) -> bool:
         # all() alone is also true for an empty or single-chain dict; requiring both chains avoids a
         # KeyError in the lookups below and lets such input reach the warn-and-return-None fallback.
         return (first_chain.value in sequences
                 and second_chain.value in sequences
                 and all(chain in chain_pair.value for chain in sequences.keys()))

     if has_pair(ChainPair.TRA_TRB, Chain.ALPHA, Chain.BETA):
         return TCABReceptor(alpha=sequences[Chain.ALPHA.value],
                             beta=sequences[Chain.BETA.value],
                             identifier=identifier,
                             metadata=metadata)
     elif has_pair(ChainPair.TRG_TRD, Chain.GAMMA, Chain.DELTA):
         return TCGDReceptor(gamma=sequences[Chain.GAMMA.value],
                             delta=sequences[Chain.DELTA.value],
                             identifier=identifier,
                             metadata=metadata)
     elif has_pair(ChainPair.IGH_IGL, Chain.HEAVY, Chain.LIGHT):
         return BCReceptor(heavy=sequences[Chain.HEAVY.value],
                           light=sequences[Chain.LIGHT.value],
                           identifier=identifier,
                           metadata=metadata)
     elif has_pair(ChainPair.IGH_IGK, Chain.HEAVY, Chain.KAPPA):
         return BCKReceptor(heavy=sequences[Chain.HEAVY.value],
                            kappa=sequences[Chain.KAPPA.value],
                            identifier=identifier,
                            metadata=metadata)
     else:
         warnings.warn(
             f"ReceptorBuilder: attempt to build_from_objects receptor with chains {sequences.keys()}, returning None..."
         )
         return None
예제 #7
0
    def process_iris_row(row, paired: bool = False, all_dual_chains: bool = True, all_genes: bool = False):
        """
        Convert one IRIS row into receptor sequences (unpaired) or TCABReceptor objects (paired).

        Args:
            row: mapping indexed by column name, e.g. "Chain: TRA (1)", "Chain: TRB (2)", "Clonotype ID"
            paired: if True, build one TCABReceptor for every alpha/beta combination found in the row;
                if False, collect the individual chains
            all_dual_chains: if True, the second alpha/beta chain ("(2)" columns) is processed as well
            all_genes: forwarded to IRISSequenceImport.process_iris_chain

        Returns:
            paired=True: list of TCABReceptor objects; the list is empty when "Chain: TRA (1)" or
            "Chain: TRB (1)" is None.
            paired=False: ReceptorSequenceList with the sequences of all processed chains whose column is not None.
        """
        if not paired:
            sequences = ReceptorSequenceList()
            # process all dual chains if specified, otherwise just chain 1
            dual_chain_ids = [1, 2] if all_dual_chains else [1]

            for chain, dual_chain_id in it.product(["A", "B"], dual_chain_ids):
                if row[f"Chain: TR{chain} ({dual_chain_id})"] is None:
                    continue
                sequences.extend(IRISSequenceImport.process_iris_chain(row, chain, dual_chain_id, all_genes))

            return sequences

        receptors = []

        # a receptor needs both primary chains; without them nothing is produced
        if row["Chain: TRA (1)"] is None or row["Chain: TRB (1)"] is None:
            return receptors

        alpha_seqs = IRISSequenceImport.process_iris_chain(row, "A", 1, all_genes)
        beta_seqs = IRISSequenceImport.process_iris_chain(row, "B", 1, all_genes)

        if all_dual_chains:
            if row["Chain: TRA (2)"] is not None:
                alpha_seqs.extend(IRISSequenceImport.process_iris_chain(row, "A", 2, all_genes))
            if row["Chain: TRB (2)"] is not None:
                beta_seqs.extend(IRISSequenceImport.process_iris_chain(row, "B", 2, all_genes))

        # every alpha chain is combined with every beta chain; the indices make the identifier unique within the row
        for alpha_i, alpha_seq in enumerate(alpha_seqs):
            for beta_i, beta_seq in enumerate(beta_seqs):
                clonotype_id = row["Clonotype ID"]
                receptors.append(TCABReceptor(alpha=alpha_seq,
                                              beta=beta_seq,
                                              identifier=f"{clonotype_id}-A{alpha_i}-B{beta_i}",
                                              metadata={"clonotype_id": clonotype_id}))

        return receptors
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """
        Build a two-receptor dataset together with a matching label configuration.

        Args:
            path: directory created through PathBuilder.build and passed to ReceptorDataset.build
            dataset_size: not used by this method; the dataset always holds two receptors

        Returns:
            tuple (dataset, label configuration), where the label configuration defines label "l1"
            with the values [1, 2]
        """
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        # (identifier, l1 value, alpha sequence, beta sequence)
        receptor_specs = [("1", 1, "AAAA", "ATA"),
                          ("2", 2, "ATA", "ATT")]

        receptors = []
        for receptor_id, label_value, alpha_aa, beta_aa in receptor_specs:
            alpha = ReceptorSequence(amino_acid_sequence=alpha_aa)
            beta = ReceptorSequence(amino_acid_sequence=beta_aa)
            receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata={"l1": label_value},
                                          identifier=receptor_id))

        PathBuilder.build(path)
        dataset = ReceptorDataset.build(receptors, 2, path)

        return dataset, lc
    def construct_test_flatten_dataset(self, path):
        """
        Build a three-receptor dataset in which the second and third receptor are identical.

        Args:
            path: location passed to ReceptorDataset.build

        Returns:
            the value returned by ReceptorDataset.build(receptors, 10, path)
        """

        def make_receptor(receptor_id, label_value, alpha_aa, alpha_id, beta_aa, beta_id):
            return TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa, identifier=alpha_id),
                                beta=ReceptorSequence(amino_acid_sequence=beta_aa, identifier=beta_id),
                                metadata={"l1": label_value},
                                identifier=receptor_id)

        # the last two entries are the same on purpose of the data layout: same sequences, ids and label
        receptor_specs = [("1", 1, "AAATTT", "1a", "ATATAT", "1b"),
                          ("2", 2, "AAAAAA", "2a", "AAAAAA", "2b"),
                          ("2", 2, "AAAAAA", "2a", "AAAAAA", "2b")]

        receptors = [make_receptor(*spec) for spec in receptor_specs]

        return ReceptorDataset.build(receptors, 10, path)
예제 #10
0
    def test_split_dataset(self):
        """
        Split ten receptors with the subjects 0, 1 and 2 using LeaveOneOutSplitter and check the splits.

        Expectations checked below: three train and three test datasets are produced; test dataset ``i``
        holds only receptors of subject ``i`` and train dataset ``i`` holds none of them.

        The temporary directory is removed in a ``finally`` clause so that a failing assertion or an
        exception in the splitter does not leave files behind for later test runs.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "leave_one_out_splitter/")

        try:
            receptors = [
                TCABReceptor(ReceptorSequence(), ReceptorSequence(),
                             {"subject": i % 3}) for i in range(10)
            ]

            filename = path / "batch1.pickle"
            with open(filename, "wb") as file:
                pickle.dump(receptors, file)

            dataset = ReceptorDataset(filenames=[filename])

            params = DataSplitterParams(
                dataset,
                SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                3,
                paths=[path / f"result_{i}/" for i in range(1, 4)],
                split_config=SplitConfig(SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                         split_count=3,
                                         leave_one_out_config=LeaveOneOutConfig(
                                             "subject", 1)))
            train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(
                params)

            self.assertEqual(3, len(train_datasets))
            self.assertEqual(3, len(test_datasets))

            for i in range(3):
                # split i: subject i is held out for testing and absent from training
                self.assertTrue(
                    all(receptor.metadata["subject"] == i
                        for receptor in test_datasets[i].get_data()))
                self.assertTrue(
                    all(receptor.metadata["subject"] != i
                        for receptor in train_datasets[i].get_data()))
        finally:
            # clean up even when one of the checks above fails
            shutil.rmtree(path)
예제 #11
0
    def generate_receptor_dataset(receptor_count: int,
                                  chain_1_length_probabilities: dict,
                                  chain_2_length_probabilities: dict,
                                  labels: dict, path: Path):
        """
        Creates receptor_count receptors where the length of sequences in each chain is sampled independently for each sequence from
        chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from the distribution given in
        labels. In this case, labels are multi-class, so each receptor will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification. chain 1 and 2 in this case refer to alpha and beta
        chain of a T-cell receptor.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 TCABReceptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0

        Args:
            receptor_count: number of receptors to generate
            chain_1_length_probabilities: mapping from sequence length to its probability for the alpha chain
            chain_2_length_probabilities: mapping from sequence length to its probability for the beta chain
            labels: mapping from label name to a mapping from class to its probability
            path: directory that is created through PathBuilder.build; the receptors are saved there as "batch01.npy"

        Returns:
            ReceptorDataset referencing the saved file, with the classes of every label taken from the keys of ``labels``
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(
            receptor_count, chain_1_length_probabilities,
            chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def get_random_sequence(length_probabilities, chain, cell_id):
            # the length is drawn first and the residues second
            length = random.choices(list(length_probabilities.keys()),
                                    length_probabilities.values())[0]
            amino_acids = "".join(random.choices(alphabet, k=length))
            return ReceptorSequence(
                amino_acids,
                metadata=SequenceMetadata(count=1,
                                          v_subgroup=chain + "V1",
                                          v_gene=chain + "V1-1",
                                          v_allele=chain + "V1-1*01",
                                          j_subgroup=chain + "J1",
                                          j_gene=chain + "J1-1",
                                          j_allele=chain + "J1-1*01",
                                          chain=chain,
                                          cell_id=cell_id))

        def get_random_labels():
            # one class per label, drawn independently with the probabilities given for that label
            return {
                label: random.choices(list(label_dict.keys()),
                                      label_dict.values(),
                                      k=1)[0]
                for label, label_dict in labels.items()
            }

        receptors = []
        for i in range(receptor_count):
            # order of random draws per receptor: alpha chain, beta chain, labels
            alpha = get_random_sequence(chain_1_length_probabilities, "TRA", i)
            beta = get_random_sequence(chain_2_length_probabilities, "TRB", i)
            receptor_metadata = {**get_random_labels(), "subject": f"subj_{i + 1}"}
            receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=receptor_metadata))

        filename = path / "batch01.npy"

        # np.rec is the public namespace for record arrays; np.core is private/deprecated in NumPy 2.0
        receptor_matrix = np.rec.fromrecords(
            [receptor.get_record() for receptor in receptors],
            names=TCABReceptor.get_record_names())
        np.save(str(filename), receptor_matrix, allow_pickle=False)

        label_classes = {
            label: list(label_dict.keys())
            for label, label_dict in labels.items()
        }
        element_class_name = type(receptors[0]).__name__ if receptors else None

        return ReceptorDataset(labels=label_classes,
                               filenames=[filename],
                               file_size=receptor_count,
                               element_class_name=element_class_name)