Exemplo n.º 1
0
    def test_encode(self):
        """Encode four repertoires with SequenceAbundanceEncoder and check the examples at two p-value thresholds."""
        path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(path)

        sequences_per_repertoire = [
            ["GGG", "III", "LLL", "MMM"],
            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
            ["CCC", "FFF", "MMM"],
            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
        ]
        repertoires, metadata = RepertoireBuilder.build(sequences_per_repertoire,
                                                        labels={"l1": [True, True, False, False]},
                                                        path=path)
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset,
                                                        comparison_attributes=["sequence_aas"],
                                                        p_value_threshold=0.4,
                                                        sequence_batch_size=4,
                                                        repertoire_batch_size=8)
        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        def encode_examples():
            # every call encodes the same dataset with freshly built encoder params
            encoding_params = EncoderParams(result_path=path, label_config=label_config)
            return encoder.encode(dataset, encoding_params).encoded_data.examples

        # first pass uses the threshold given to build_object (0.4)
        examples = encode_examples()
        self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]), examples))

        # second pass: with the stricter threshold the first column is expected to be all zeros
        encoder.p_value_threshold = 0.05
        examples = encode_examples()
        self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]), examples))

        shutil.rmtree(path)
Exemplo n.º 2
0
    def test_encode_sequence(self):
        """Check the 3-mers produced for "CASSVFRTY" and that a sequence shorter than k encodes to None."""

        def make_params():
            # a fresh params object (and model dict) per call, as each encode call needs its own
            return EncoderParams(model={"k": 3},
                                 label_config=LabelConfiguration(),
                                 result_path="",
                                 pool_size=4)

        receptor_sequence = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
        kmers = KmerSequenceEncoder.encode_sequence(receptor_sequence, make_params())

        for expected_kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
            self.assertTrue(expected_kmer in kmers)

        self.assertEqual(7, len(kmers))

        # two residues cannot form a 3-mer, so no encoding is expected
        too_short = ReceptorSequence(amino_acid_sequence="AC")
        self.assertEqual(KmerSequenceEncoder.encode_sequence(too_short, make_params()), None)
Exemplo n.º 3
0
    def test_encode_sequence(self):
        """Check that IdentitySequenceEncoder encodes "AAA" as ["AAA"] for frame types OUT, STOP and IN."""

        def encode_with_frame_type(frame_type):
            # build a new sequence and a new encoder for each frame type
            receptor_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                                 metadata=SequenceMetadata(frame_type=frame_type))
            identity_encoder = IdentitySequenceEncoder()
            return identity_encoder.encode_sequence(
                receptor_sequence,
                EncoderParams(model={}, label_config=LabelConfiguration(), result_path=""))

        for frame_type in ("OUT", "STOP"):
            self.assertEqual(encode_with_frame_type(frame_type), ["AAA"])

        self.assertEqual(["AAA"], encode_with_frame_type("IN"))
Exemplo n.º 4
0
    def encode_dataset(dataset,
                       hp_setting: HPSetting,
                       path: str,
                       learn_model: bool,
                       context: dict,
                       number_of_processes: int,
                       label_configuration: LabelConfiguration,
                       encode_labels: bool = True,
                       store_encoded_data: bool = False):
        """
        Build the result directory and run DataEncoder on the dataset with the encoder of the given HP setting.

        :param dataset: dataset passed on to DataEncoderParams
        :param hp_setting: provides the encoder and the encoder parameters (used as the model of EncoderParams)
        :param path: result path; built with PathBuilder before encoding
        :param learn_model: forwarded to EncoderParams; also selects the output filename
                            ("train_dataset.pkl" if True, "test_dataset.pkl" otherwise)
        :param context: not used in this function
        :param number_of_processes: forwarded to EncoderParams as pool_size
        :param label_configuration: forwarded to EncoderParams as label_config
        :param encode_labels: forwarded to EncoderParams
        :param store_encoded_data: forwarded to DataEncoderParams
        :return: whatever DataEncoder.run returns for these parameters
        """
        PathBuilder.build(path)

        if learn_model:
            filename = "train_dataset.pkl"
        else:
            filename = "test_dataset.pkl"

        encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                       result_path=path,
                                       pool_size=number_of_processes,
                                       label_config=label_configuration,
                                       learn_model=learn_model,
                                       filename=filename,
                                       encode_labels=encode_labels)
        run_params = DataEncoderParams(dataset=dataset,
                                       encoder=hp_setting.encoder,
                                       encoder_params=encoder_params,
                                       store_encoded_data=store_encoded_data)

        return DataEncoder.run(run_params)
Exemplo n.º 5
0
    def test_generate(self):
        """Fit a ReceptorCNN on one-hot encoded random receptors and check the files KernelSequenceLogo writes."""
        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}kernel_sequence_logo/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {True: 0.5, False: 0.5}},
            path=path + "dataset/")

        one_hot_encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
        encoding_params = EncoderParams(path + "result/", LabelConfiguration([Label("CMV", [True, False])]))
        enc_dataset = one_hot_encoder.encode(dataset, encoding_params)

        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path + "logos/")
        report.generate_report()

        # one png and one csv per chain and kernel, plus the fully connected layer weights
        expected_files = (
            "alpha_kernel_3_1.png",
            "alpha_kernel_3_2.png",
            "beta_kernel_3_1.png",
            "beta_kernel_3_2.png",
            "alpha_kernel_3_1.csv",
            "alpha_kernel_3_2.csv",
            "beta_kernel_3_1.csv",
            "beta_kernel_3_2.csv",
            "fully_connected_layer_weights.csv",
            "fully_connected_layer_weights.html",
        )
        for file_name in expected_files:
            self.assertTrue(os.path.isfile(f"{path}logos/{file_name}"))

        shutil.rmtree(path)
Exemplo n.º 6
0
    def test_encode_no_v(self):
        """Encode the dummy dataset with MatchedRegexEncoder (no V gene matching, summed counts) and check the result."""
        path = EnvironmentSettings.root_path + "test/tmp/regex_matches_encoder/"

        dataset, label_config, motif_filepath, labels = self.create_dummy_data(path)

        encoder = MatchedRegexEncoder.build_object(dataset,
                                                   motif_filepath=motif_filepath,
                                                   match_v_genes=False,
                                                   sum_counts=True)

        encoding_params = EncoderParams(result_path=path,
                                        label_config=label_config,
                                        filename="dataset.csv")
        encoded = encoder.encode(dataset, encoding_params)

        # one expected row per example, one column per feature name asserted below
        expected_outcome = [[20, 10, 0, 0], [0, 0, 10, 0], [0, 0, 0, 5]]
        for index, expected_row in enumerate(expected_outcome):
            self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

        self.assertListEqual(["1_IGL", "1_IGH", "2_IGH", "3_IGL"],
                             encoded.encoded_data.feature_names)
        self.assertListEqual(["subject_1", "subject_2", "subject_3"],
                             encoded.encoded_data.example_ids)

        shutil.rmtree(path)
Exemplo n.º 7
0
    def test_sequence_flattened(self):
        """Encode the flatten test dataset with a flattening OneHotEncoder and check examples and feature names."""
        path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"

        PathBuilder.build(path)

        dataset = self.construct_test_flatten_dataset(path)

        encoder = OneHotEncoder.build_object(dataset,
                                             use_positional_info=False,
                                             distance_to_seq_middle=None,
                                             flatten=True)

        label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
        encoding_params = EncoderParams(result_path=path,
                                        label_config=label_config,
                                        pool_size=1,
                                        learn_model=True,
                                        model={},
                                        filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, encoding_params)

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        # 20-element one-hot vectors with the 1 at index 0 and at index 16 respectively
        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0] * 3

        examples = encoded_data.encoded_data.examples
        self.assertListEqual(list(examples[0]), onehot_a * 3 + onehot_t * 3)
        self.assertListEqual(list(examples[1]), (onehot_a + onehot_t) * 3)

        expected_feature_names = [f"{pos}_{char}"
                                  for pos in range(6)
                                  for char in EnvironmentSettings.get_sequence_alphabet()]
        self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)
        shutil.rmtree(path)
Exemplo n.º 8
0
    def test(self):
        """Encode the test dataset with a non-flattening OneHotEncoder and check examples, example ids and labels."""
        path = EnvironmentSettings.tmp_test_path + "onehot_sequence/"
        PathBuilder.build(path)

        dataset, lc = self._construct_test_dataset(path)

        encoder = OneHotEncoder.build_object(dataset,
                                             use_positional_info=False,
                                             distance_to_seq_middle=None,
                                             flatten=False)

        encoding_params = EncoderParams(result_path=f"{path}encoded/",
                                        label_config=lc,
                                        learn_model=True,
                                        model={},
                                        filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, encoding_params)

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        # 20-element one-hot rows; the all-zero row is expected where a sequence has no residue at a position
        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        expected_examples = [
            [onehot_a] * 4,
            [onehot_a, onehot_t, onehot_a, onehot_empty],
            [onehot_a, onehot_t, onehot_t, onehot_empty],
        ]
        for index, expected_rows in enumerate(expected_examples):
            actual_rows = [list(item) for item in encoded_data.encoded_data.examples[index]]
            self.assertListEqual(actual_rows, expected_rows)

        self.assertListEqual(encoded_data.encoded_data.example_ids,
                             [receptor.identifier for receptor in dataset.get_data()])

        expected_labels = {label_name: [receptor_seq.get_attribute(label_name) for receptor_seq in dataset.get_data()]
                           for label_name in ("l1", "l2")}
        self.assertDictEqual(encoded_data.encoded_data.labels, expected_labels)

        shutil.rmtree(path)
    def get_encoded_repertoire(self, repertoire, params: EncoderParams):
        """
        Return the encoding of one repertoire via CacheHandler.memo_by_params.

        Overwrites params.model with this encoder's attribute dict (vars(self)). The memo key is
        built from that model, the labels by name, the repertoire identifier and a SHA-256 digest of the
        repertoire's amino acid sequences; encode_repertoire is passed as the callback that computes the value.

        :param repertoire: repertoire to encode
        :param params: encoder parameters; its model attribute is replaced as a side effect
        :return: the value returned by CacheHandler.memo_by_params
        """
        params.model = vars(self)

        # key parts are evaluated in the same order in which they appear in the key
        labels_by_name = params.label_config.get_labels_by_name()
        repertoire_id = repertoire.identifier
        sequence_array = np.ascontiguousarray(repertoire.get_sequence_aas())
        sequences_digest = hashlib.sha256(sequence_array).hexdigest()

        memo_key = (("encoding_model", params.model),
                    ("labels", labels_by_name),
                    ("repertoire_id", repertoire_id),
                    ("repertoire_data", sequences_digest))

        def compute_encoding():
            return self.encode_repertoire(repertoire, params)

        return CacheHandler.memo_by_params(memo_key, compute_encoding, CacheObjectType.ENCODING_STEP)
    def test_run(self):
        """Fit a model on k-mer frequency encoded repertoires, run MLApplicationInstruction and check the predictions file."""

        path = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
        PathBuilder.build(path)

        label_probabilities = {"l1": {1: 0.5, 2: 0.5}}
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, label_probabilities,
                                                                     path + 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY,
                                            ReadsType.UNIQUE,
                                            SequenceEncodingType.CONTINUOUS_KMER,
                                            3,
                                            scale_to_zero_mean=True,
                                            scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        training_params = EncoderParams(result_path=path,
                                        label_config=label_config,
                                        filename="tmp_enc_dataset.pickle",
                                        pool_size=4)
        enc_dataset = encoder.encode(dataset, training_params)
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        encoder_settings = {
            "normalization_type": "relative_frequency",
            "reads": "unique",
            "sequence_encoding": "continuous_kmer",
            "k": 3,
            "scale_to_zero_mean": True,
            "scale_to_unit_variance": True
        }
        hp_setting = HPSetting(encoder, encoder_settings, ml_method, {}, [], 'enc1', 'ml1')

        # copy the vectorizer and scaler pickles found in `path` (presumably written by the encode call above)
        # into the folder the instruction works in
        instruction_path = path + 'result/instr1/'
        PathBuilder.build(instruction_path)
        for pickle_name in ('dict_vectorizer.pickle', 'scaler.pickle'):
            shutil.copy(path + pickle_name, instruction_path + pickle_name)

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path + 'result/')

        predictions_path = path + "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        # one prediction row per repertoire in the dataset
        predictions = pd.read_csv(predictions_path)
        self.assertEqual(50, predictions.shape[0])

        shutil.rmtree(path)
Exemplo n.º 11
0
 def _encode_sequence(self, sequence: ReceptorSequence,
                      params: EncoderParams, sequence_encoder, counts):
     """
     Add the features of one sequence to the running feature counts.

     Overwrites params.model with this encoder's attribute dict (vars(self)) before encoding.
     With ReadsType.UNIQUE every feature adds 1; with ReadsType.ALL every feature adds
     sequence.metadata.count. If the sequence encoder returns None, counts is left untouched.

     :param sequence: the sequence to encode
     :param params: encoder parameters; its model attribute is replaced as a side effect
     :param sequence_encoder: object whose encode_sequence(sequence, params) yields the features or None
     :param counts: mapping from feature to count, updated in place
     :return: the same counts object
     """
     params.model = vars(self)
     sequence_features = sequence_encoder.encode_sequence(sequence, params)
     if sequence_features is None:
         return counts

     for feature in sequence_features:
         if self.reads == ReadsType.UNIQUE:
             counts[feature] += 1
             continue
         if self.reads == ReadsType.ALL:
             counts[feature] += sequence.metadata.count
     return counts
Exemplo n.º 12
0
    def test_encode(self):
        """Encode three random repertoires with AtchleyKmerEncoder and check the shape and one value of the examples."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")
        label_probabilities = {"l1": {True: 0.4, False: 0.6}}
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1}, label_probabilities,
                                                                     path + "dataset/")

        encoder = AtchleyKmerEncoder.build_object(dataset,
                                                  k=2,
                                                  skip_first_n_aa=1,
                                                  skip_last_n_aa=1,
                                                  abundance="RELATIVE_ABUNDANCE",
                                                  normalize_all_features=False)
        encoding_params = EncoderParams(path + "result/", LabelConfiguration(labels=[Label("l1")]))
        examples = encoder.encode(dataset, encoding_params).encoded_data.examples

        self.assertEqual((3, 11, 3), examples.shape)
        self.assertEqual(0., examples[0, -1, 0])

        shutil.rmtree(path)
    def test_encode_sequence(self):
        """Check gapped k-mers and feature names of GappedKmerSequenceEncoder for "ABCDEFG" under several models."""

        def make_params(**model):
            # every call gets its own model dict and its own label configuration
            return EncoderParams(model=model,
                                 label_config=LabelConfiguration(),
                                 result_path="")

        sequence = ReceptorSequence("ABCDEFG", None, None)

        def check_k_left_three_max_gap_one():
            kmers = GappedKmerSequenceEncoder.encode_sequence(sequence, make_params(k_left=3, max_gap=1))
            self.assertEqual({'ABC.EFG', 'ABCDEF', 'BCDEFG'}, set(kmers))

            feature_names = GappedKmerSequenceEncoder.get_feature_names(make_params(k_left=3, max_gap=1))
            self.assertEqual({'sequence'}, set(feature_names))

            # k_left of 10 does not fit the 7-residue sequence, so None is expected
            self.assertEqual(GappedKmerSequenceEncoder.encode_sequence(sequence, make_params(k_left=10, max_gap=1)),
                             None)

        check_k_left_three_max_gap_one()

        sequence.amino_acid_sequence = "ABCDEFG"
        check_k_left_three_max_gap_one()

        # explicit k_right and min_gap: only gap-of-one k-mers with 2 left and 3 right residues are expected
        sequence.amino_acid_sequence = "ABCDEFG"
        kmers = GappedKmerSequenceEncoder.encode_sequence(
            sequence, make_params(k_left=2, max_gap=1, min_gap=1, k_right=3))
        self.assertEqual({'AB.DEF', 'BC.EFG'}, set(kmers))

        feature_names = GappedKmerSequenceEncoder.get_feature_names(
            make_params(k_left=2, max_gap=1, min_gap=1, k_right=3))
        self.assertEqual({'sequence'}, set(feature_names))
    def test_encode_sequence(self):
        """Check the position-annotated 3-mers of IMGTKmerSequenceEncoder for a long and a short sequence."""

        def encode_kmers(receptor_sequence, k):
            encoding_params = EncoderParams(model={"k": k},
                                            label_config=LabelConfiguration(),
                                            result_path="")
            return IMGTKmerSequenceEncoder.encode_sequence(receptor_sequence, encoding_params)

        sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
        result = encode_kmers(sequence, 3)

        expected_long = {
            'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110', 'ERA///111',
            'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004', 'EQC///111.005',
            'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
            'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013', 'ATY///112.012',
            'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008', 'CAS///112.007',
            'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003', 'RER///112.002',
            'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115',
        }
        self.assertEqual(expected_long, set(result))

        # a sequence of length n has n - k + 1 overlapping k-mers
        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

        sequence = ReceptorSequence("AHCDE", None, None)
        result = encode_kmers(sequence, 3)

        self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))
        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

        # k larger than the sequence length: no encoding is expected
        self.assertEqual(encode_kmers(sequence, 25), None)
Exemplo n.º 15
0
    def test_encode_sequence(self):
        """Check the position-annotated gapped k-mers of IMGTGappedKmerEncoder for a short and a longer sequence."""

        def encode_kmers(amino_acids):
            receptor_sequence = ReceptorSequence(amino_acids, None, None)
            encoding_params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                                            label_config=LabelConfiguration(),
                                            result_path="")
            return IMGTGappedKmerEncoder.encode_sequence(receptor_sequence, encoding_params)

        expected_short = {
            'AH///105', 'HC///106', 'CD///107', 'DE///116', 'A.C///105',
            'H.D///106', 'C.E///107'
        }
        self.assertEqual(expected_short, set(encode_kmers("AHCDE")))

        expected_long = {
            'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
            'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
            'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
            'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
            'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001',
            'A.Y///112.002', 'T.E///112.001', 'Y.Q///112', 'E.C///113',
            'Q.A///114', 'C.Y///115'
        }
        self.assertEqual(expected_long, set(encode_kmers("CASSPRERATYEQCAY")))
Exemplo n.º 16
0
    def test_generate(self):
        """
        Import a paired-chain receptor dataset, encode it with TCRdistEncoder and run TCRdistMotifDiscovery.

        The test contains no assertions; it passes when the whole pipeline runs without raising.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        # maps the column names of the input file to the names used for the imported receptors
        column_mapping = {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier"
        }
        import_params = {
            "path": dataset_path,
            "result_path": path + "dataset/",
            "separator": ",",
            "columns_to_load": [
                "subject", "epitope", "count", "v_a_gene", "j_a_gene",
                "cdr3_a_aa", "v_b_gene", "j_b_gene", "cdr3_b_aa",
                "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq"
            ],
            "column_mapping": column_mapping,
            "receptor_chains": "TRA_TRB",
            "region_type": "IMGT_CDR3",
            "sequence_file_size": 50000,
            "organism": "mouse"
        }
        imported_dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

        tcrdist_encoder = TCRdistEncoder(8)
        encoding_params = EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")]))
        encoded_dataset = tcrdist_encoder.encode(imported_dataset, encoding_params)

        report = TCRdistMotifDiscovery(encoded_dataset, path + "report/", "report name", 8)
        report.generate_report()

        shutil.rmtree(path)
Exemplo n.º 17
0
    def test__encode_new_dataset(self):
        """Encode the dummy dataset with MatchedSequencesEncoder (exact matches) and check examples, labels and annotations."""
        path = EnvironmentSettings.root_path + "test/tmp/matched_receptors_encoder/"

        dataset, label_config, reference_sequences, labels = self.create_dummy_data(path)

        encoder = MatchedSequencesEncoder.build_object(dataset,
                                                       reference=reference_sequences,
                                                       max_edit_distance=0)

        encoding_params = EncoderParams(result_path=path,
                                        label_config=label_config,
                                        filename="dataset.csv")
        encoded = encoder.encode(dataset, encoding_params)

        expected_outcome = [[10, 0], [0, 10], [0, 5]]
        for index, expected_row in enumerate(expected_outcome):
            self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

        expected_labels = {
            "label": ["yes", "yes", "no"],
            "subject_id": ["subject_1", "subject_2", "subject_3"]
        }
        self.assertDictEqual(encoded.encoded_data.labels, expected_labels)
        self.assertListEqual(encoded.encoded_data.feature_names, ["100_TRB", "200_TRB"])

        # each feature annotation attribute is checked against its expected values, in this order
        expected_annotations = (
            ("sequence_id", ["100_TRB", "200_TRB"]),
            ("chain", ["beta", "beta"]),
            ("sequence", ["AAAA", "SSSS"]),
            ("v_gene", ["TRBV1", "TRBV1"]),
            ("j_gene", ["TRBJ1", "TRBJ1"]),
        )
        for attribute_name, expected_values in expected_annotations:
            annotation_values = getattr(encoded.encoded_data.feature_annotations, attribute_name)
            self.assertListEqual(list(annotation_values), expected_values)

        shutil.rmtree(path)
Exemplo n.º 18
0
    def test_run(self):
        """Run DataEncoder with a Word2VecEncoder on two single-sequence repertoires and check the result type and size."""
        path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                      metadata={"l1": 1, "l2": 2},
                                                      path=path)
        rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                      metadata={"l1": 0, "l2": 3},
                                                      path=path)

        # NOTE(review): the values registered here ([1, 2] for l1, [0, 3] for l2) differ from the metadata
        # values used above (l1: 1 and 0, l2: 2 and 3) - confirm whether this is intended
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(dataset,
                                               k=3,
                                               model_type=ModelType.SEQUENCE.name,
                                               vector_size=6)

        encoder_params = EncoderParams(model={},
                                       pool_size=2,
                                       label_config=lc,
                                       result_path=path,
                                       filename="dataset.csv")
        run_params = DataEncoderParams(dataset=dataset,
                                       encoder=encoder,
                                       encoder_params=encoder_params,
                                       store_encoded_data=False)
        res = DataEncoder.run(run_params)

        self.assertTrue(isinstance(res, RepertoireDataset))
        # one encoded example per repertoire
        self.assertTrue(res.encoded_data.examples.shape[0] == 2)

        shutil.rmtree(path)
Exemplo n.º 19
0
    def test_repertoire_flattened(self):
        """Encode the test repertoire dataset with a flattening OneHotEncoder and check examples and feature names."""
        path = EnvironmentSettings.root_path + "test/tmp/onehot_recep_flat/"

        PathBuilder.build(path)

        dataset, lc = self._construct_test_repertoiredataset(path, positional=False)

        encoder = OneHotEncoder.build_object(dataset,
                                             use_positional_info=False,
                                             distance_to_seq_middle=None,
                                             flatten=True)

        encoding_params = EncoderParams(result_path=path,
                                        label_config=lc,
                                        pool_size=1,
                                        learn_model=True,
                                        model={},
                                        filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, encoding_params)

        self.assertTrue(isinstance(encoded_data, RepertoireDataset))

        # 20-element one-hot vectors; the all-zero vector is used for the padded slots in the expectations
        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0] * 3
        onehot_empty = [0] * 20

        # expectations are grouped per sequence: 3 sequences x 4 positions, matching the feature names below
        a_t_a_padded = onehot_a + onehot_t + onehot_a + onehot_empty
        t_a_a_padded = onehot_t + onehot_a + onehot_a + onehot_empty

        examples = encoded_data.encoded_data.examples
        self.assertListEqual(list(examples[0]), onehot_a * 4 + a_t_a_padded * 2)
        self.assertListEqual(list(examples[1]), a_t_a_padded + t_a_a_padded + onehot_empty * 4)

        expected_feature_names = [f"{seq}_{pos}_{char}"
                                  for seq in range(3)
                                  for pos in range(4)
                                  for char in EnvironmentSettings.get_sequence_alphabet()]
        self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)

        shutil.rmtree(path)
Exemplo n.º 20
0
    def test_encode(self):
        """Encode two repertoires with a sequence-model Word2VecEncoder and check example shape, labels and encoder type."""

        test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        # first repertoire holds both sequences, second one only the first sequence
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path,
                                                      {"T1D": "T1D", "subject_id": "1"})
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      {"T1D": "CTL", "subject_id": "2"})

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(dataset, k=3, model_type="sequence", vector_size=16)

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        encoded = encoded_dataset.encoded_data
        self.assertIsNotNone(encoded)
        # two repertoires, each represented by a vector of size vector_size
        self.assertTrue(encoded.examples.shape[0] == 2)
        self.assertTrue(encoded.examples.shape[1] == 16)
        self.assertTrue(len(encoded.labels["T1D"]) == 2)
        self.assertTrue(encoded.labels["T1D"][0] == "T1D")
        self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

        shutil.rmtree(test_path)
Exemplo n.º 21
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder and exports the design matrix to
    "<result_path>/csv_exported/"
    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results; may be given with or without a trailing "/"
    :param metadata_path: csv file with columns "filename", "subject_id", "disease" which is filled by default if value of argument is None,
                          otherwise any metadata csv file passed to the function, must include filename and subject_id columns,
                          and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    # the first key of dataset.params is used as the label for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.params.keys())[0]

    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    })
    encoder_params = EncoderParams(result_path=result_path,
                                   label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])]))

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params, False))

    # The previous check compared result_path[:-1] (everything but the last character) with '/',
    # which produced "dir//csv_exported/" for paths that already ended in a slash.
    export_root = result_path if result_path.endswith('/') else result_path + '/'
    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=f"{export_root}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
Exemplo n.º 22
0
    def test(self):
        """Check the non-flattened one-hot encoding without positional information.

        The dataset and label configuration come from
        ``self._construct_test_dataset``. The expected one-hot vectors have 20
        entries, with "A" at index 0 and "T" at index 16; all-zero rows are
        presumably padding for shorter sequences (not verifiable from this test).
        """
        path = EnvironmentSettings.tmp_test_path + "onehot_sequence_1/"
        PathBuilder.build(path)

        dataset, lc = self._construct_test_dataset(path)

        encoder = OneHotEncoder.build_object(dataset,
                                             use_positional_info=False,
                                             distance_to_seq_middle=6,
                                             flatten=False)

        encoder_params = EncoderParams(result_path=f"{path}encoded/",
                                       label_config=lc,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, encoder_params)

        self.assertTrue(isinstance(encoded_data, ReceptorDataset))

        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        # (first index, second index) into the examples array -> expected rows.
        expected_rows = [
            ((0, 0), [onehot_a] * 4),
            ((0, 1), [onehot_a, onehot_t, onehot_a, onehot_empty]),
            ((1, 0), [onehot_a, onehot_t, onehot_a, onehot_empty]),
            ((1, 1), [onehot_a, onehot_t, onehot_t, onehot_empty]),
        ]
        for (row, col), expected in expected_rows:
            actual = [
                list(item)
                for item in encoded_data.encoded_data.examples[row, col]
            ]
            self.assertListEqual(actual, expected)

        shutil.rmtree(path)
Exemplo n.º 23
0
    def test_encode(self):

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
3051	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15761	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3051	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15761	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
        """
        path = PathBuilder.build(EnvironmentSettings.root_path +
                                 "test/tmp/trcdist_encoder/")

        with open(path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"
        params['organism'] = 'human'

        dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

        encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(f"{path}result/",
                          LabelConfiguration([Label("epitope")])))

        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0]
                        == encoded_dataset.encoded_data.examples.shape[1]
                        and encoded_dataset.encoded_data.examples.shape[0]
                        == dataset.get_example_count())

        shutil.rmtree(path)
Exemplo n.º 24
0
    def test_encode(self):
        """Encode four labelled repertoires with SequenceCountEncoder.

        Uses a p-value threshold of 0.4 with label "l1" (positive class True)
        and checks the encoded value for each repertoire and that "III" is
        among the reported feature names.
        """
        path = EnvironmentSettings.tmp_test_path + "count_encoder/"
        PathBuilder.build(path)

        sequences_per_repertoire = [
            ["GGG", "III", "LLL", "MMM"],
            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
            ["CCC", "FFF", "MMM"],
            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
        ]
        repertoires, metadata = RepertoireBuilder.build(
            sequences_per_repertoire,
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        encoder = SequenceCountEncoder.build_object(
            dataset,
            comparison_attributes=["sequence_aas"],
            p_value_threshold=0.4,
            sequence_batch_size=4)

        label_config = LabelConfiguration(
            [Label("l1", [True, False], positive_class=True)])
        encoder_params = EncoderParams(result_path=path,
                                       label_config=label_config)

        encoded_dataset = encoder.encode(dataset, encoder_params)

        examples = encoded_dataset.encoded_data.examples

        # Expected encoded value for each of the four repertoires, in order.
        for index, expected in enumerate([1, 1, 0, 0]):
            self.assertTrue(examples[index] == expected)

        self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

        shutil.rmtree(path)
Exemplo n.º 25
0
    def test_encode(self):
        """Encode a dataset with DistanceEncoder using the Jaccard metric.

        The dataset comes from ``self.create_dataset``. The test checks that
        the encoded examples are 8 x 8, spot-checks three cells that must be
        1, and compares the "l1" and "l2" label vectors with expected values.
        """
        path = EnvironmentSettings.tmp_test_path + "distance_encoder/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        enc = DistanceEncoder.build_object(
            dataset,
            distance_metric=DistanceMetricType.JACCARD.name,
            attributes_to_match=["sequence_aas"],
            sequence_batch_size=20)
        enc.set_context({"dataset": dataset})

        label_config = LabelConfiguration(
            [Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoder_params = EncoderParams(result_path=path,
                                       label_config=label_config,
                                       pool_size=4,
                                       filename="dataset.pkl")
        encoded = enc.encode(dataset, encoder_params)

        for axis in (0, 1):
            self.assertEqual(8, encoded.encoded_data.examples.shape[axis])

        # Cells of the encoded matrix that are expected to equal 1.
        for row, col in ((0, 0), (1, 1), (0, 4)):
            self.assertEqual(1, encoded.encoded_data.examples.iloc[row, col])

        expected_labels = (
            ("l1", [1, 0, 1, 0, 1, 0, 1, 0]),
            ("l2", [2, 3, 2, 3, 2, 3, 3, 3]),
        )
        for label_name, expected in expected_labels:
            self.assertTrue(
                np.array_equal(expected,
                               encoded.encoded_data.labels[label_name]))

        shutil.rmtree(path)
Exemplo n.º 26
0
    def test_encode(self):
        """Encode a sub-dataset with DeepRCEncoder and inspect the written files.

        The datasets come from ``self.create_datasets``; the main dataset is
        passed as the encoder context. The test checks the example ids, the
        metadata file named in ``encoded_data.info`` and one .tsv file per
        repertoire of the main dataset under "encoded_data/encoding/".
        """
        path = EnvironmentSettings.tmp_test_path + "deeprc_encoder/"
        PathBuilder.build(path)
        PathBuilder.build(path + "encoded_data/")

        main_dataset, sub_dataset = self.create_datasets(path)

        enc = DeepRCEncoder.build_object(sub_dataset)
        enc.set_context({"dataset": main_dataset})

        label_config = LabelConfiguration(
            [Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoder_params = EncoderParams(result_path=path + "encoded_data/",
                                       label_config=label_config,
                                       pool_size=4)
        encoded = enc.encode(sub_dataset, encoder_params)

        self.assertListEqual(encoded.encoded_data.example_ids,
                             sub_dataset.get_repertoire_ids())

        metadata_filepath = encoded.encoded_data.info["metadata_filepath"]
        self.assertTrue(os.path.isfile(metadata_filepath))

        metadata_content = pd.read_csv(
            encoded.encoded_data.info["metadata_filepath"], sep="\t")
        self.assertListEqual(list(metadata_content["ID"]),
                             sub_dataset.get_repertoire_ids())

        # Each repertoire of the main dataset must have its own .tsv file whose
        # "amino_acid" column matches the repertoire's amino acid sequences.
        for repertoire in main_dataset.repertoires:
            rep_path = f"{path}/encoded_data/encoding/{repertoire.identifier}.tsv"
            self.assertTrue(os.path.isfile(rep_path))
            written_sequences = pd.read_csv(rep_path, sep="\t")["amino_acid"]
            self.assertListEqual(list(written_sequences),
                                 list(repertoire.get_sequence_aas()))

        shutil.rmtree(path)
    def test_fit(self):
        """Fit, predict with, store and reload an AtchleyKmerMILClassifier.

        A random dataset of 10 repertoires is encoded with AtchleyKmerEncoder.
        The test checks the number and type of predictions, the predicted
        probabilities, and that a classifier loaded from storage has the same
        non-array attributes as the fitted one.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path +
                                 "kmermil/")
        repertoire_count = 10

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=repertoire_count,
            sequence_count_probabilities={2: 1},
            sequence_length_probabilities={4: 1},
            labels={"l1": {True: 0.5, False: 0.5}},
            path=path + "dataset/")

        encoder = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False)
        label_config = LabelConfiguration([Label("l1", [True, False])])
        enc_dataset = encoder.encode(
            dataset, EncoderParams(path + "result/", label_config))

        # Both the fitted and the reloaded classifier use the same settings.
        classifier_settings = {
            "iteration_count": 10,
            "threshold": -0.0001,
            "evaluate_at": 2,
            "use_early_stopping": False,
            "random_seed": 1,
            "learning_rate": 0.01,
            "zero_abundance_weight_init": True,
            "number_of_threads": 8,
        }

        classifier = AtchleyKmerMILClassifier(**classifier_settings)
        classifier.fit(enc_dataset.encoded_data, "l1")

        predictions = classifier.predict(enc_dataset.encoded_data, "l1")
        self.assertEqual(repertoire_count, len(predictions["l1"]))
        bool_prediction_count = sum(
            1 for pred in predictions["l1"] if isinstance(pred, bool))
        self.assertEqual(repertoire_count, bool_prediction_count)

        predictions_proba = classifier.predict_proba(enc_dataset.encoded_data,
                                                     "l1")
        self.assertEqual(repertoire_count,
                         np.rint(np.sum(predictions_proba["l1"])))
        self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

        classifier.store(path + "model_storage/",
                         feature_names=enc_dataset.encoded_data.feature_names)

        reloaded = AtchleyKmerMILClassifier(**classifier_settings)
        reloaded.load(path + "model_storage/")

        # vars() returns the live attribute dict, so popping the key also
        # removes the attribute from the classifier object itself.
        reloaded_vars = vars(reloaded)
        reloaded_vars.pop("logistic_regression")
        fitted_vars = vars(classifier)
        fitted_vars.pop("logistic_regression")

        for item, value in fitted_vars.items():
            if isinstance(value, np.ndarray):
                continue
            self.assertEqual(value, reloaded_vars[item])

        model = classifier.get_model("l1")
        self.assertEqual(vars(classifier), model)

        shutil.rmtree(path)
Exemplo n.º 28
0
    def test_fit(self):
        """Fit, predict with, store and reload a ReceptorCNN.

        A random dataset of 500 receptors labelled "CMV" is encoded with
        OneHotReceptorEncoder. The test checks the number and type of
        predictions, the predicted probabilities, and that a model loaded from
        storage has the same non-array attributes as the fitted one.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "cnn/")
        receptor_count = 500

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=receptor_count,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {True: 0.5, False: 0.5}},
            path=path + "dataset/")

        encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
        label_config = LabelConfiguration([Label("CMV", [True, False])])
        enc_dataset = encoder.encode(
            dataset, EncoderParams(path + "result/", label_config))

        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(encoded_data=enc_dataset.encoded_data, label_name="CMV")

        predictions = cnn.predict(enc_dataset.encoded_data, "CMV")
        self.assertEqual(receptor_count, len(predictions["CMV"]))
        bool_prediction_count = sum(
            1 for pred in predictions["CMV"] if isinstance(pred, bool))
        self.assertEqual(receptor_count, bool_prediction_count)

        predictions_proba = cnn.predict_proba(enc_dataset.encoded_data, "CMV")
        self.assertEqual(receptor_count,
                         np.rint(np.sum(predictions_proba["CMV"])))
        self.assertEqual(receptor_count, predictions_proba["CMV"].shape[0])

        cnn.store(path + "model_storage/")

        reloaded = ReceptorCNN(sequence_type="amino_acid")
        reloaded.load(path + "model_storage/")

        # vars() returns the live attribute dict, so popping the key also
        # removes the attribute from the model object itself.
        reloaded_vars = vars(reloaded)
        reloaded_vars.pop("CNN")
        fitted_vars = vars(cnn)
        fitted_vars.pop("CNN")

        for item, value in fitted_vars.items():
            if isinstance(value, np.ndarray):
                continue
            self.assertEqual(value, reloaded_vars[item])

        model = cnn.get_model(["CMV"])
        self.assertEqual(vars(cnn), model)

        shutil.rmtree(path)
Exemplo n.º 29
0
    def test_encode(self):
        """Encode two repertoires with three KmerFrequencyEncoder configurations.

        The configurations are identity / relative frequency, continuous
        k-mer / relative frequency and continuous k-mer / binary, all with k=3
        and unique reads. The temporary directory is removed before the
        assertions on the returned datasets run.
        """
        path = EnvironmentSettings.root_path + "test/tmp/kmerfreqenc/"
        PathBuilder.build(path)

        first_sequences = [ReceptorSequence("AAA", identifier="1"),
                           ReceptorSequence("ATA", identifier="2"),
                           ReceptorSequence("ATA", identifier='3')]
        rep1 = Repertoire.build_from_sequence_objects(
            first_sequences,
            metadata={"l1": 1, "l2": 2, "subject_id": "1"},
            path=path)

        second_sequences = [ReceptorSequence("ATA", identifier="1"),
                            ReceptorSequence("TAA", identifier="2"),
                            ReceptorSequence("AAC", identifier="3")]
        rep2 = Repertoire.build_from_sequence_objects(
            second_sequences,
            metadata={"l1": 0, "l2": 3, "subject_id": "2"},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        def build_encoder(normalization_type, sequence_encoding):
            # All three encoders share unique reads and k=3.
            return KmerFrequencyEncoder.build_object(dataset, **{
                "normalization_type": normalization_type.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": sequence_encoding.name,
                "k": 3
            })

        encoder = build_encoder(NormalizationType.RELATIVE_FREQUENCY,
                                SequenceEncodingType.IDENTITY)
        d1 = encoder.encode(dataset, EncoderParams(result_path=path + "1/",
                                                   label_config=lc,
                                                   learn_model=True,
                                                   model={},
                                                   filename="dataset.pkl"))

        encoder = build_encoder(NormalizationType.RELATIVE_FREQUENCY,
                                SequenceEncodingType.CONTINUOUS_KMER)
        d2 = encoder.encode(dataset, EncoderParams(result_path=path + "2/",
                                                   label_config=lc,
                                                   pool_size=2,
                                                   learn_model=True,
                                                   model={},
                                                   filename="dataset.csv"))

        encoder3 = build_encoder(NormalizationType.BINARY,
                                 SequenceEncodingType.CONTINUOUS_KMER)
        d3 = encoder3.encode(dataset, EncoderParams(result_path=path + "3/",
                                                    label_config=lc,
                                                    learn_model=True,
                                                    model={},
                                                    filename="dataset.pkl"))

        shutil.rmtree(path)

        for encoded in (d1, d2):
            self.assertTrue(isinstance(encoded, RepertoireDataset))
        self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
        self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
        self.assertTrue(isinstance(encoder, KmerFrequencyEncoder))
Exemplo n.º 30
0
    def test_encode(self):
        """Encode two repertoires with EvennessProfileEncoder.

        The first repertoire mixes sequences with counts 10, 100 and 1 (1000
        of each); the second holds 1000 sequences that all have count 10. Only
        the 51-dimension encoding is asserted on; the 11-dimension encoding
        with pool_size=2 is run but its result is not inspected.
        """
        path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"
        PathBuilder.build(path)

        def sequences_with_count(count):
            # 1000 separate sequence objects sharing the same clonal count.
            return [
                ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
                for _ in range(1000)
            ]

        mixed_count_sequences = (sequences_with_count(10)
                                 + sequences_with_count(100)
                                 + sequences_with_count(1))
        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=mixed_count_sequences,
            metadata={"l1": "test_1", "l2": 2},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=sequences_with_count(10),
            metadata={"l1": "test_2", "l2": 3},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", ["test_1", "test_2"])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        encoder = EvennessProfileEncoder.build_object(dataset,
                                                      min_alpha=0,
                                                      max_alpha=10,
                                                      dimension=51)
        d1 = encoder.encode(
            dataset, EncoderParams(result_path=path + "1/", label_config=lc))

        encoder = EvennessProfileEncoder.build_object(dataset,
                                                      min_alpha=0,
                                                      max_alpha=10,
                                                      dimension=11)
        # The result is not asserted on; the call only has to complete.
        d2 = encoder.encode(
            dataset,
            EncoderParams(result_path=path, label_config=lc, pool_size=2))

        # (row, column) of the 51-dimension encoding -> expected value.
        expected_values = (
            ((0, 0), 1),
            ((0, 1), 0.786444),
            ((1, 0), 1),
            ((1, 1), 1),
        )
        for (row, col), expected in expected_values:
            self.assertAlmostEqual(d1.encoded_data.examples[row, col],
                                   expected)

        shutil.rmtree(path)