예제 #1
0
    def create_from_record(cls, record):
        """
        Build a BCKReceptor from a structured record.

        The record is expected to hold the heavy and kappa chain fields under
        'heavy_'- and 'kappa_'-prefixed names, plus 'identifier', 'metadata'
        (a JSON string) and 'version' fields.

        Raises:
            NotImplementedError: if the record has no 'version' field or its
                version is not equal to BCKReceptor.version.
        """
        version_supported = ('version' in record.dtype.names
                             and record['version'] == BCKReceptor.version)
        if not version_supported:
            raise NotImplementedError(
                f"Supported ({BCKReceptor.version}) and available version differ, but there is no converter available."
            )

        def _chain_record(prefix):
            # Select the prefixed fields of one chain, then rename them to the
            # plain names that ReceptorSequence.create_from_record reads.
            prefixed_names = [
                prefix + name
                for name in ReceptorSequence.get_record_names()
            ]
            chain_record = record[prefixed_names]
            chain_record.dtype.names = ReceptorSequence.get_record_names()
            return chain_record

        heavy_record = _chain_record('heavy_')
        kappa_record = _chain_record('kappa_')

        heavy_chain = ReceptorSequence.create_from_record(heavy_record)
        kappa_chain = ReceptorSequence.create_from_record(kappa_record)

        return BCKReceptor(heavy=heavy_chain,
                           kappa=kappa_chain,
                           identifier=record['identifier'],
                           metadata=json.loads(record['metadata']))
예제 #2
0
    def create_from_record(cls, record: np.void):
        """
        Build a TCGDReceptor from a structured record.

        The record is expected to hold the gamma and delta chain fields under
        'gamma_'- and 'delta_'-prefixed names, plus 'identifier', 'metadata'
        (a JSON string) and 'version' fields.

        Raises:
            NotImplementedError: if the record has no 'version' field or its
                version is not equal to TCGDReceptor.version.
        """
        has_supported_version = ('version' in record.dtype.names
                                 and record['version'] == TCGDReceptor.version)
        if not has_supported_version:
            raise NotImplementedError(
                f"Supported ({TCGDReceptor.version}) and available version differ, but there is no converter available."
            )

        # Extract one sub-record per chain and strip the chain prefix from its
        # field names so that ReceptorSequence can read it directly.
        chain_records = {}
        for chain_name in ('gamma', 'delta'):
            selected_fields = [
                chain_name + '_' + name
                for name in ReceptorSequence.get_record_names()
            ]
            sub_record = record[selected_fields]
            sub_record.dtype.names = ReceptorSequence.get_record_names()
            chain_records[chain_name] = sub_record

        gamma_chain = ReceptorSequence.create_from_record(chain_records['gamma'])
        delta_chain = ReceptorSequence.create_from_record(chain_records['delta'])

        return TCGDReceptor(gamma=gamma_chain,
                            delta=delta_chain,
                            identifier=record['identifier'],
                            metadata=json.loads(record['metadata']))
예제 #3
0
 def get_record_names(cls):
     """
     Return the field names of a flattened receptor record.

     The list holds the ReceptorSequence record names prefixed with 'heavy_',
     then the same names prefixed with 'light_', then every entry of
     cls.FIELDS other than 'heavy' and 'light'.
     """
     record_names = []
     for chain_prefix in ('heavy_', 'light_'):
         record_names.extend(
             chain_prefix + name
             for name in ReceptorSequence.get_record_names())

     # The chain fields themselves are replaced by the prefixed names above.
     record_names.extend(
         name for name in cls.FIELDS if name not in ('heavy', 'light'))
     return record_names
예제 #4
0
    def generate_sequence_dataset(sequence_count: int,
                                  length_probabilities: dict, labels: dict,
                                  path: Path):
        """
        Creates sequence_count receptor sequences (single chain) where the length of each sequence is sampled independently
        from the length_probabilities distribution. The labels are also randomly assigned to sequences from the distribution
        given in labels. In this case, labels are multi-class, so each sequence will get one class from each label. This means
        that negative classes for the labels should be included as well in the specification.

        An example of input parameters is given below:

        sequence_count: 100 # generate 100 TRB ReceptorSequences
        length_probabilities:
            14: 0.8 # 80% of all generated sequences will have length 14
            15: 0.2 # 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0

        All sequences are written as one structured array to "batch01.npy" under path.

        Returns:
            a SequenceDataset that references the written file, lists the classes of every label and has
            file_size set to sequence_count
        """
        RandomDatasetGenerator._check_sequence_dataset_generation_params(
            sequence_count, length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        chain = "TRB"

        # The length distribution does not change between sequences, so it is unpacked once.
        lengths = list(length_probabilities.keys())
        length_weights = list(length_probabilities.values())

        sequences = []
        for i in range(sequence_count):
            # Draw order (length, residues, labels) is kept fixed so that seeded runs stay reproducible.
            sequence_length = random.choices(lengths, length_weights)[0]
            amino_acid_sequence = "".join(
                random.choices(alphabet, k=sequence_length))

            custom_params = {
                label: random.choices(list(label_dict.keys()),
                                      list(label_dict.values()),
                                      k=1)[0]
                for label, label_dict in labels.items()
            }
            # Assigned last, so it overrides a label that happens to be called "subject".
            custom_params["subject"] = f"subj_{i + 1}"

            sequences.append(
                ReceptorSequence(amino_acid_sequence,
                                 metadata=SequenceMetadata(
                                     count=1,
                                     v_subgroup=chain + "V1",
                                     v_gene=chain + "V1-1",
                                     v_allele=chain + "V1-1*01",
                                     j_subgroup=chain + "J1",
                                     j_gene=chain + "J1-1",
                                     j_allele=chain + "J1-1*01",
                                     chain=chain,
                                     custom_params=custom_params)))

        filename = path / "batch01.npy"

        # np.rec is the public alias of the record-array module; np.core is private and deprecated in NumPy 2.
        sequence_matrix = np.rec.fromrecords(
            [seq.get_record() for seq in sequences],
            names=ReceptorSequence.get_record_names())
        np.save(str(filename), sequence_matrix, allow_pickle=False)

        return SequenceDataset(labels={
            label: list(label_dict.keys())
            for label, label_dict in labels.items()
        },
                               filenames=[filename],
                               file_size=sequence_count)
예제 #5
0
    def test_make_subset(self):
        """
        Check that SequenceDataset.make_subset keeps only the requested examples.

        100 sequences with identifiers "0".."99" are stored in 10 batch files of
        10 sequences each; a subset of 15 indices is requested and the resulting
        dataset must contain exactly 15 examples, all with requested identifiers.
        """
        sequences = [
            ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i))
            for i in range(100)
        ]

        path = EnvironmentSettings.tmp_test_path / "element_generator_subset/"
        PathBuilder.build(path)

        try:
            filenames = []
            for i in range(10):
                filepath = path / f"batch{i}.npy"
                sequences_to_pickle = sequences[i * 10:(i + 1) * 10]
                # np.rec is the public alias of the record-array module;
                # np.core is private and deprecated in NumPy 2.
                sequence_matrix = np.rec.fromrecords(
                    [seq.get_record() for seq in sequences_to_pickle],
                    names=ReceptorSequence.get_record_names())
                np.save(str(filepath), sequence_matrix, allow_pickle=False)
                filenames.append(filepath)

            d = SequenceDataset(filenames=filenames, file_size=10)

            indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

            d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

            for batch in d2.get_batch(1000):
                for sequence in batch:
                    self.assertIn(int(sequence.identifier), indices)

            self.assertEqual(15, d2.get_example_count())
        finally:
            # Remove the temporary files even when an assertion above fails.
            shutil.rmtree(path)
예제 #6
0
 def get_record_names(cls):
     """
     Return the field names of a flattened receptor record.

     The list holds the ReceptorSequence record names prefixed with 'alpha_',
     then the same names prefixed with 'beta_', then every entry of
     cls.FIELDS other than 'alpha' and 'beta'.
     """
     record_names = []
     for chain_prefix in ('alpha_', 'beta_'):
         record_names.extend(
             chain_prefix + name
             for name in ReceptorSequence.get_record_names())

     # The chain fields themselves are replaced by the prefixed names above.
     record_names.extend(
         name for name in cls.FIELDS if name not in ('alpha', 'beta'))
     return record_names
예제 #7
0
 def get_record_names(cls):
     """
     Return the field names of a flattened receptor record.

     The list holds the ReceptorSequence record names prefixed with 'gamma_',
     then the same names prefixed with 'delta_', then every entry of
     cls.FIELDS other than 'gamma' and 'delta'.
     """
     record_names = []
     for chain_prefix in ('gamma_', 'delta_'):
         record_names.extend(
             chain_prefix + name
             for name in ReceptorSequence.get_record_names())

     # The chain fields themselves are replaced by the prefixed names above.
     record_names.extend(
         name for name in cls.FIELDS if name not in ('gamma', 'delta'))
     return record_names