示例#1
0
    def create_dataset(self, path, dataset_size: int = 50):
        """Build and return a pickled SequenceDataset of `dataset_size` sequences.

        Even-indexed sequences are "AAACCC" with label l1=1, odd-indexed are
        "ACACAC" with label l1=2, so the two classes are balanced and separable.

        :param path: directory in which the pickle file is created
        :param dataset_size: total number of sequences to generate
        :return: a SequenceDataset backed by the pickle file
        """
        sequences = []

        for i in range(dataset_size):
            # alternate between the two class prototypes
            if i % 2 == 0:
                amino_acids, label_value = "AAACCC", 1
            else:
                amino_acids, label_value = "ACACAC", 2
            sequences.append(
                ReceptorSequence(
                    amino_acid_sequence=amino_acids,
                    identifier=str(i),
                    metadata=SequenceMetadata(custom_params={"l1": label_value})))

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        # Fix: removed an unused LabelConfiguration that was built here and
        # then discarded; the label values are carried by `params` below.
        dataset = SequenceDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
示例#2
0
    def test_encode_sequence(self):
        """IdentitySequenceEncoder must return the raw amino acid sequence
        unchanged, regardless of the frame type stored in the metadata."""
        for frame_type in ("OUT", "STOP", "IN"):
            sequence = ReceptorSequence(
                amino_acid_sequence="AAA",
                metadata=SequenceMetadata(frame_type=frame_type))
            encoder = IdentitySequenceEncoder()
            params = EncoderParams(model={},
                                   label_config=LabelConfiguration(),
                                   result_path="")
            self.assertEqual(["AAA"],
                             encoder.encode_sequence(sequence, params))
示例#3
0
    def _create_label_config(self, instruction: dict, dataset: Dataset,
                             instruction_key: str) -> LabelConfiguration:
        """Build a LabelConfiguration from the instruction's "labels" entry.

        For each label, possible values are recovered from the dataset:
        first from dataset.params, then via dataset.get_metadata(); if neither
        is available, an empty list is used and a warning is issued.
        """
        labels = instruction["labels"]

        self._check_label_format(labels, instruction_key)

        label_config = LabelConfiguration()
        for label in labels:
            # a label entry is either a plain name (str) or a one-key dict
            # of the form {name: {"positive_class": ...}}
            label_name = label if isinstance(label, str) else list(
                label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(
                label, dict) else None
            if dataset.params is not None and label_name in dataset.params:
                label_values = dataset.params[label_name]
            elif hasattr(dataset, "get_metadata"):
                # deduplicate the metadata column to get the value set
                label_values = list(
                    set(dataset.get_metadata([label_name])[label_name]))
            else:
                # last resort: proceed with no known values, but warn the user
                label_values = []
                warnings.warn(
                    f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be recovered for label "
                    f"{label}, using empty list instead.  This could cause problems with some encodings. "
                    f"If that might be the case, check if the dataset {dataset.name} has been properly loaded."
                )

            label_config.add_label(label_name,
                                   label_values,
                                   positive_class=positive_class)
        return label_config
示例#4
0
    def test(self):
        """Integration test: train a logistic regression on a k-mer-frequency
        encoded sequence dataset via TrainMLModelInstruction."""

        path = EnvironmentSettings.tmp_test_path + "integration_sequence_classification/"
        dataset = self.create_dataset(path)

        # use the test cache so the run does not pollute the real cache
        os.environ["cache_type"] = "test"
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(
            dataset, **encoder_params),
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={
                                   "model_selection_cv": False,
                                   "model_selection_n_folds": -1
                               },
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        # one random 50/50 split for both assessment and selection
        instruction = TrainMLModelInstruction(
            dataset, GridSearch([hp_setting]), [hp_setting],
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        result = instruction.run(result_path=path)

        shutil.rmtree(path)
    def test_encode_sequence(self):
        """KmerSequenceEncoder yields all overlapping 3-mers of a sequence,
        and returns None for sequences shorter than k."""
        receptor_seq = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
        result = KmerSequenceEncoder.encode_sequence(
            receptor_seq,
            EncoderParams(model={"k": 3},
                          label_config=LabelConfiguration(),
                          result_path="",
                          pool_size=4))

        # every overlapping 3-mer of "CASSVFRTY" must be present, and no extras
        for expected_kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
            self.assertTrue(expected_kmer in result)
        self.assertEqual(7, len(result))

        # a sequence shorter than k cannot be encoded
        too_short = ReceptorSequence(amino_acid_sequence="AC")
        self.assertEqual(
            KmerSequenceEncoder.encode_sequence(
                too_short,
                EncoderParams(model={"k": 3},
                              label_config=LabelConfiguration(),
                              result_path="",
                              pool_size=4)), None)
示例#6
0
    def __init__(self, split_index: int, train_val_dataset, test_dataset, path: str, label_configuration: LabelConfiguration):
        """Store the datasets and bookkeeping for one assessment split.

        One HPLabelState is created per configured label, keyed by label name.
        """
        self.split_index = split_index
        self.train_val_dataset = train_val_dataset
        self.test_dataset = test_dataset
        self.path = path
        self.train_val_data_reports = []
        self.test_data_reports = []

        # computed: per-label optimization state
        self.label_states = {}
        for label_name in label_configuration.get_labels_by_name():
            auxiliary = label_configuration.get_auxiliary_labels(label_name)
            self.label_states[label_name] = HPLabelState(label_name, auxiliary)
示例#7
0
    def _construct_test_repertoiredataset(self, path, positional):
        """Create a two-repertoire RepertoireDataset plus a LabelConfiguration.

        :param path: directory where repertoire files are written
        :param positional: if True, use long homogeneous sequences (suitable
            for positional encodings); otherwise short mixed sequences
        :return: (dataset, label_configuration) tuple
        """
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        # Fix: the original used list comprehensions purely for their side
        # effects ([receptors.append(seq) for seq in ...]); plain loops keep
        # the identical append order without building throwaway lists.
        if positional:
            first = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                     ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
            second = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
        else:
            first = [ReceptorSequence("AAAA", identifier="1"),
                     ReceptorSequence("ATA", identifier="2"),
                     ReceptorSequence("ATA", identifier='3')]
            second = [ReceptorSequence("ATA", identifier="1"),
                      ReceptorSequence("TAA", identifier="2")]

        for seq in first:
            receptors1.append(seq)
        for seq in second:
            receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={
                                                          "l1": 1,
                                                          "l2": 2,
                                                          "subject_id": "1"
                                                      },
                                                      path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={
                                                          "l1": 0,
                                                          "l2": 3,
                                                          "subject_id": "2"
                                                      },
                                                      path=path)

        # NOTE(review): the l1 values declared here ([1, 2]) do not match the
        # repertoire metadata above (1 and 0) — kept as-is to preserve behavior;
        # confirm against the tests that consume this fixture.
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
示例#8
0
    def construct_test_flatten_dataset(self, path):
        """Pickle two labelled ReceptorSequences ("AAATTT" l1=1, "ATATAT" l1=2)
        under `path` and wrap them in a SequenceDataset.

        :param path: directory in which the pickle file is created
        :return: the SequenceDataset backed by the pickle file
        """
        sequences = [ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1", metadata=SequenceMetadata(custom_params={"l1": 1})),
                     ReceptorSequence(amino_acid_sequence="ATATAT", identifier="2", metadata=SequenceMetadata(custom_params={"l1": 2}))]

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        # Fix: removed an unused LabelConfiguration that was built here and
        # then discarded; the label values are passed via `params` instead.
        return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    def test_run(self):
        """Integration test for ExploratoryAnalysisInstruction with a
        MatchedSequencesRepertoireEncoder fed by a small VDJdb reference file;
        asserts the DesignMatrixExporter report produced its csv output."""
        path = EnvironmentSettings.tmp_test_path + "explanalysisprocintegration/"
        PathBuilder.build(path)
        os.environ["cache_type"] = "test"

        dataset = self.create_dataset(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [0, 1])
        label_config.add_label("l2", [2, 3])

        # minimal tab-separated VDJdb export with a single TRA record
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
        """

        with open(path + "refs.tsv", "w") as file:
            file.writelines(file_content)

        refs = {
            "params": {
                "path": path + "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        units = {
            "named_analysis_4":
            ExploratoryAnalysisUnit(
                dataset=dataset,
                report=DesignMatrixExporter(),
                label_config=label_config,
                encoder=MatchedSequencesRepertoireEncoder.build_object(
                    dataset, **{
                        "max_edit_distance": 1,
                        "reference": refs
                    }))
        }

        process = ExploratoryAnalysisInstruction(units, name="exp")
        process.run(path + "results/")

        # the report path encodes instruction name + unit key
        self.assertTrue(
            os.path.isfile(
                path +
                "results/exp/analysis_named_analysis_4/report/design_matrix.csv"
            ))

        shutil.rmtree(path)
示例#10
0
    def test_sequence_flattened(self):
        """OneHotEncoder with flatten=True: each 6-mer sequence becomes one
        flat vector of 6 concatenated one-hot rows, with pos_char feature names."""
        path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"

        PathBuilder.build(path)

        dataset = self.construct_test_flatten_dataset(path)

        encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False, "distance_to_seq_middle": None, "flatten": True})

        encoded_data = encoder.encode(dataset, EncoderParams(
            result_path=path,
            label_config=LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")]),
            pool_size=1,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        # expected one-hot rows per these fixtures: 'A' at alphabet index 0,
        # 'T' at index 16 of the 20-letter alphabet
        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0] * 3

        # fixture sequences are "AAATTT" and "ATATAT", flattened position by position
        self.assertListEqual(list(encoded_data.encoded_data.examples[0]), onehot_a+onehot_a+onehot_a+onehot_t+onehot_t+onehot_t)
        self.assertListEqual(list(encoded_data.encoded_data.examples[1]), onehot_a+onehot_t+onehot_a+onehot_t+onehot_a+onehot_t)

        self.assertListEqual(list(encoded_data.encoded_data.feature_names), [f"{pos}_{char}" for pos in range(6) for char in EnvironmentSettings.get_sequence_alphabet()])
        shutil.rmtree(path)
示例#11
0
    def test_encode(self):
        """SequenceAbundanceEncoder: the first column (label-associated
        sequence count) shrinks when the p-value threshold is tightened,
        while the second column (total unique sequences) is unchanged."""
        path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                         ["CCC", "FFF", "MMM"],
                                                         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                        labels={"l1": [True, True, False, False]}, path=path)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset, **{
            "comparison_attributes": ["sequence_aas"],
            "p_value_threshold": 0.4, "sequence_batch_size": 4, "repertoire_batch_size": 8
        })

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        # tighter threshold: no sequences pass, so the first column drops to 0
        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
    def _prepare_optional_params(self, analysis: dict,
                                 symbol_table: SymbolTable) -> dict:
        """Collect the optional parts of an exploratory analysis spec.

        "encoding" and "labels" must appear together (KeyError otherwise);
        "preprocessing_sequence" and "number_of_processes" are independent
        extras. Returns a dict of only the parameters that were specified.
        """
        params = {}
        dataset = symbol_table.get(analysis["dataset"])

        has_encoding = "encoding" in analysis
        has_labels = "labels" in analysis

        if has_encoding and has_labels:
            encoder_config = symbol_table.get_config(analysis["encoding"])["encoder_params"]
            params["encoder"] = symbol_table.get(analysis["encoding"]).build_object(dataset, **encoder_config)
            label_config = LabelConfiguration()
            for label in analysis["labels"]:
                label_config.add_label(label, self._get_label_values(label, dataset))
            params["label_config"] = label_config
        elif has_encoding or has_labels:
            # one without the other is a configuration error
            raise KeyError(
                "ExploratoryAnalysisParser: keys for analyses are not properly defined. "
                "If encoding is defined, labels have to be defined as well and vice versa."
            )

        if "preprocessing_sequence" in analysis:
            params["preprocessing_sequence"] = symbol_table.get(
                analysis["preprocessing_sequence"])

        if "number_of_processes" in analysis:
            params["number_of_processes"] = analysis["number_of_processes"]

        return params
示例#13
0
    def test_run(self):
        """DataEncoder.run with a Word2VecEncoder on a two-repertoire dataset:
        the result is still a RepertoireDataset with one encoded row per repertoire."""
        path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        self.assertTrue(isinstance(res, RepertoireDataset))
        # one encoded example per repertoire
        self.assertTrue(res.encoded_data.examples.shape[0] == 2)

        shutil.rmtree(path)
示例#14
0
    def test_encode(self):
        """Word2VecEncoder on a two-repertoire dataset: encoding yields one
        16-dimensional vector per repertoire with matching T1D labels, and
        build_object dispatches to the repertoire-level encoder subclass."""

        test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": "sequence",
                "vector_size": 16
            })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        self.assertIsNotNone(encoded_dataset.encoded_data)
        # one vector per repertoire, sized by vector_size
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
        self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
        self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
        # repertoire dataset input must select the repertoire-level subclass
        self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

        shutil.rmtree(test_path)
    def test_run(self):
        """MLApplicationInstruction end-to-end: apply a pre-trained model +
        encoder to a dataset and check predictions.csv has one row per example."""

        path = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
        PathBuilder.build(path)

        # 50 repertoires, 5 sequences of length 5 each, balanced binary label l1
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            50, {5: 1}, {5: 1}, {"l1": {
                1: 0.5,
                2: 0.5
            }}, path + 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(
            NormalizationType.RELATIVE_FREQUENCY,
            ReadsType.UNIQUE,
            SequenceEncodingType.CONTINUOUS_KMER,
            3,
            scale_to_zero_mean=True,
            scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        # fit the model outside the instruction to produce reusable artifacts
        enc_dataset = encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=label_config,
                          filename="tmp_enc_dataset.pickle",
                          pool_size=4))
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        hp_setting = HPSetting(
            encoder, {
                "normalization_type": "relative_frequency",
                "reads": "unique",
                "sequence_encoding": "continuous_kmer",
                "k": 3,
                "scale_to_zero_mean": True,
                "scale_to_unit_variance": True
            }, ml_method, {}, [], 'enc1', 'ml1')

        # copy the fitted vectorizer/scaler where the instruction expects them
        # (presumably written by the encoder during encode() — verify if this
        # test starts failing on missing files)
        PathBuilder.build(path + 'result/instr1/')
        shutil.copy(path + 'dict_vectorizer.pickle',
                    path + 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path + 'scaler.pickle',
                    path + 'result/instr1/scaler.pickle')

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4,
                                          "instr1", False)
        ml_app.run(path + 'result/')

        predictions_path = path + "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        # one prediction row per repertoire
        df = pd.read_csv(predictions_path)
        self.assertEqual(50, df.shape[0])

        shutil.rmtree(path)
示例#16
0
    def test_encode(self):
        """AtchleyKmerEncoder: 3 repertoires of 4-mers with k=2 and one AA
        skipped at each end yield a (3, 11, 3) example tensor."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, path + "dataset/")

        encoder = AtchleyKmerEncoder.build_object(dataset, **{"k": 2, "skip_first_n_aa": 1, "skip_last_n_aa": 1, "abundance": "RELATIVE_ABUNDANCE",
                                                              "normalize_all_features": False})
        encoded_dataset = encoder.encode(dataset, EncoderParams(path + "result/", LabelConfiguration(labels=[Label("l1")])))

        self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
        # last feature of the first example is expected to be exactly zero
        self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

        shutil.rmtree(path)
示例#17
0
    def run_setting(state: TrainMLModelState, hp_setting, train_dataset, val_dataset, split_index: int,
                    current_path: str, label: str, assessment_index: int):
        """Train and evaluate one hyperparameter setting on one selection split.

        Runs an MLProcess restricted to the single `label`, appends the
        resulting hp_item to the selection state of the given assessment
        split, and returns the optimization-metric value (or None when no
        performance was computed).
        """
        hp_item = MLProcess(train_dataset=train_dataset, test_dataset=val_dataset, encoding_reports=state.selection.reports.encoding_reports.values(),
                            label_config=LabelConfiguration([state.label_configuration.get_label_object(label)]), report_context=state.context,
                            number_of_processes=state.number_of_processes, metrics=state.metrics, optimization_metric=state.optimization_metric,
                            ml_reports=state.selection.reports.model_reports.values(), label=label, path=current_path, hp_setting=hp_setting,
                            store_encoded_data=state.store_encoded_data)\
            .run(split_index)

        # record the item so model selection can later compare settings
        state.assessment_states[assessment_index].label_states[label].selection_state.hp_items[hp_setting.get_key()].append(hp_item)

        return hp_item.performance[state.optimization_metric.name.lower()] if hp_item.performance is not None else None
示例#18
0
    def test_run(self):
        """MLMethodAssessment.run on a perfectly separable encoded dataset:
        low log loss, one ml_score row, and one prediction row per example."""
        path = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
        # 12 examples; features [1, 1] map to label 1 and [3, 3] to label 3
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={"l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3], "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]}
        )

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label_name='l1')

        # Fix: reuse the local `path` instead of re-deriving the identical
        # long literal six separate times below.
        res = MLMethodAssessment.run(MLMethodAssessmentParams(
            dataset=dataset,
            method=method1,
            metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
            optimization_metric=Metric.LOG_LOSS,
            predictions_path=path + "predictions.csv",
            label="l1",
            ml_score_path=path + "ml_score.csv",
            split_index=1,
            path=path
        ))

        self.assertTrue(isinstance(res, dict))
        # separable data: the model should be near-certain
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(path + "ml_score.csv"))

        df = pd.read_csv(path + "ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(path + "predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(path)
示例#19
0
    def test_run(self):
        """SemanticModel smoke test: run a TrainMLModelInstruction (Word2Vec +
        logistic regression) over a 32-repertoire dataset with alternating labels."""

        path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
        PathBuilder.build(path)
        # 32 repertoires alternating between {"AAA","CCC"} (label 1) and {"TTTT"} (label 2)
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"]], path,
                                                      {"default": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
                                                                   1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    params={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **{"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}),
                                 {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3},
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

        # single random 50/50 split for both nested CV loops
        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              split_config_assessment,
                                              split_config_selection,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
    def test_encode_sequence(self):
        """IMGTGappedKmerEncoder with k_left=1, max_gap=1: produces both
        contiguous 2-mers and gapped pairs ('X.Y'), each suffixed with
        '///<IMGT position>'."""
        sequence = ReceptorSequence("AHCDE", None, None)
        kmers = IMGTGappedKmerEncoder.encode_sequence(
            sequence,
            EncoderParams(model={
                "k_left": 1,
                "max_gap": 1
            },
                          label_config=LabelConfiguration(),
                          result_path=""))

        self.assertEqual(
            {
                'AH///105', 'HC///106', 'CD///107', 'DE///116', 'A.C///105',
                'H.D///106', 'C.E///107'
            }, set(kmers))

        # longer sequence exercises the fractional IMGT numbering (111.x/112.x)
        sequence = ReceptorSequence("CASSPRERATYEQCAY", None, None)
        kmers = IMGTGappedKmerEncoder.encode_sequence(
            sequence,
            EncoderParams(model={
                "k_left": 1,
                "max_gap": 1
            },
                          label_config=LabelConfiguration(),
                          result_path=""))

        self.assertEqual(
            {
                'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
                'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
                'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
                'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
                'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001',
                'A.Y///112.002', 'T.E///112.001', 'Y.Q///112', 'E.C///113',
                'Q.A///114', 'C.Y///115'
            }, set(kmers))
    def test_encode_sequence(self):
        """IMGTKmerSequenceEncoder: 3-mers annotated with IMGT positions
        (including fractional 111.x/112.x numbering for long CDR3s); returns
        None when k exceeds the sequence length."""
        sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(sequence, EncoderParams(
                                                                    model={"k": 3},
                                                                    label_config=LabelConfiguration(),
                                                                    result_path=""))

        self.assertEqual({'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110', 'ERA///111',
                          'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004', 'EQC///111.005',
                          'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
                          'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013', 'ATY///112.012',
                          'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008', 'CAS///112.007',
                          'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003', 'RER///112.002',
                          'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115'},
                         set(result))

        # number of k-mers must be len(sequence) - k + 1
        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

        sequence = ReceptorSequence("AHCDE", None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(sequence, EncoderParams(
                                                                    model={"k": 3},
                                                                    label_config=LabelConfiguration(),
                                                                    result_path=""))

        self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'},
                         set(result))

        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)
        # k larger than the sequence: encoding is impossible, None is returned
        self.assertEqual(
            IMGTKmerSequenceEncoder.encode_sequence(
                              sequence,
                              EncoderParams(model={"k": 25},
                                            label_config=LabelConfiguration(),
                                            result_path="")
            ),
            None
        )
    def test_generate(self):
        """TCRdistMotifDiscovery smoke test: import a paired-chain receptor
        dataset, encode with TCRdist distances, and generate the report."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path +
                                 "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        # single-line-per-receptor CSV with paired alpha/beta chains
        dataset = SingleLineReceptorImport.import_dataset(
            {
                "path":
                dataset_path,
                "result_path":
                path + "dataset/",
                "separator":
                ",",
                "columns_to_load": [
                    "subject", "epitope", "count", "v_a_gene", "j_a_gene",
                    "cdr3_a_aa", "v_b_gene", "j_b_gene", "cdr3_b_aa",
                    "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq"
                ],
                # map tcrdist-style column names onto the internal schema
                "column_mapping": {
                    "cdr3_a_aa": "alpha_amino_acid_sequence",
                    "cdr3_b_aa": "beta_amino_acid_sequence",
                    "cdr3_a_nucseq": "alpha_nucleotide_sequence",
                    "cdr3_b_nucseq": "beta_nucleotide_sequence",
                    "v_a_gene": "alpha_v_gene",
                    "v_b_gene": "beta_v_gene",
                    "j_a_gene": "alpha_j_gene",
                    "j_b_gene": "beta_j_gene",
                    "clone_id": "identifier"
                },
                "receptor_chains":
                "TRA_TRB",
                "region_type":
                "IMGT_CDR3",
                "sequence_file_size":
                50000,
                "organism":
                "mouse"
            }, 'd1')

        # compute pairwise TCRdist distances for the "epitope" label
        dataset = TCRdistEncoder(8).encode(
            dataset,
            EncoderParams(f"{path}result/",
                          LabelConfiguration([Label("epitope")])))

        report = TCRdistMotifDiscovery(dataset, path + "report/",
                                       "report name", 8)
        report.generate_report()

        shutil.rmtree(path)
示例#23
0
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """Pickle two paired alpha/beta TCABReceptors (labels l1=1 and l1=2)
        under `path` and return them as a ReceptorDataset.

        :param path: directory in which the pickle file is created
        :param dataset_size: kept for interface compatibility; the fixture
            always contains exactly 2 receptors regardless of this value
        :return: (dataset, label_configuration) tuple
        """
        # Fix: dropped the redundant str("1")/str("2") wrappers around
        # identifiers that are already string literals.
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATA"),
                         metadata={"l1": 1},
                         identifier="1"),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATT"),
                         metadata={"l1": 2},
                         identifier="2")
        ]

        PathBuilder.build(path)
        filename = "{}receptors.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = ReceptorDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset, lc
示例#24
0
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str) -> MLApplicationInstruction:
        """Validate an MLApplication instruction spec and build the instruction.

        Asserts that exactly the expected keys are present with valid types,
        that the dataset exists in the symbol table, then loads the trained
        HP setting from config_path and wraps everything in an
        MLApplicationInstruction.
        """
        location = MLApplicationParser.__name__
        ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'label', 'pool_size', 'config_path', 'store_encoded_data'], location, key)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET), location, f"{key}: dataset")
        ParameterValidator.assert_type_and_value(instruction['pool_size'], int, location, f"{key}: pool_size", min_inclusive=1)
        ParameterValidator.assert_type_and_value(instruction['label'], str, location, f'{key}: label')
        ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
        ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location, f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        # note: `instruction` is rebound here from the raw spec dict to the
        # constructed instruction object that is returned
        instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key, pool_size=instruction['pool_size'],
                                               label_configuration=LabelConfiguration([label]), hp_setting=hp_setting,
                                               store_encoded_data=instruction['store_encoded_data'])

        return instruction
示例#25
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder
    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results
    :param metadata_path: csv file with columns "filename", "subject_id", "disease" which is filled by default if value of argument is None,
                          otherwise any metadata csv file passed to the function, must include filename and subject_id columns,
                          and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region of each sequence
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.params.keys())[0]  # label that can be used for ML prediction - by default: "disease" with values True/False

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])])), False))

    # Bug fix: the original condition was `result_path[:-1] == '/'`, which compares the whole
    # string minus its LAST character to '/'; the intent is "does the path already end with '/'".
    normalized_result_path = result_path if result_path.endswith('/') else result_path + '/'

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=f"{normalized_result_path}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
    def _create_state_object(self, path):
        """Build a 34-repertoire synthetic dataset, run a TrainMLModel instruction on it, and return the state."""
        n_repertoires = 34
        # every repertoire holds the same three sequences; only the labels vary
        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(n_repertoires)],
            path=path,
            labels={
                "l1": [1, 2] * 17,                      # strictly alternating binary label
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})   # mixed pattern, 34 values total

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    params={"l1": [1, 2], "l2": [0, 1]})

        encoder_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **encoder_params), encoder_params,
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1},
                                 [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              SplitConfig(SplitType.RANDOM, 1, 0.5),
                                              SplitConfig(SplitType.RANDOM, 1, 0.5),
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        return instruction.run(result_path=path)
示例#27
0
    def create_dummy_data(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "yes", "no"]
        }

        metadata = {
            "v_gene": "TRBV1",
            "j_gene": "TRBJ1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
            path=path,
            labels=labels,
            seq_metadata=[[{
                **metadata, "count": 10
            }], [{
                **metadata, "count": 10
            }], [{
                **metadata, "count": 5
            }, {
                **metadata, "count": 5
            }]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

        with open(path + "refs.tsv", "w") as file:
            file.writelines(file_content)

        reference_sequences = {
            "params": {
                "path": path + "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        return dataset, label_config, reference_sequences, labels
示例#28
0
    def test_encode(self):
        """Import a paired TRA/TRB VDJdb receptor dataset and check that TCRdistEncoder yields a square distance matrix."""

        # Raw VDJdb export: four receptors (paired by complex.id), two distinct clonotypes
        # duplicated under fresh complex ids (3050/3051 and 15760/15761).
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
3051	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15761	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3051	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15761	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
        """
        path = PathBuilder.build(EnvironmentSettings.root_path +
                                 "test/tmp/trcdist_encoder/")

        # NOTE(review): writelines() on a plain str writes it character by character;
        # file.write(file_content) would be the clearer call (same output either way).
        with open(path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        # start from the default VDJdb import parameters and override for a paired,
        # receptor-level (non-repertoire) import
        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"
        params['organism'] = 'human'

        dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

        encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(f"{path}result/",
                          LabelConfiguration([Label("epitope")])))

        # the pairwise distance matrix must be square with one row per receptor
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0]
                        == encoded_dataset.encoded_data.examples.shape[1]
                        and encoded_dataset.encoded_data.examples.shape[0]
                        == dataset.get_example_count())

        shutil.rmtree(path)
示例#29
0
    def test_encode(self):
        """SequenceCountEncoder should mark the label-associated sequence counts per repertoire."""
        path = EnvironmentSettings.tmp_test_path + "count_encoder/"
        PathBuilder.build(path)

        repertoire_sequences = [["GGG", "III", "LLL", "MMM"],
                                ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                ["CCC", "FFF", "MMM"],
                                ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]]
        repertoires, metadata = RepertoireBuilder.build(repertoire_sequences,
                                                        labels={"l1": [True, True, False, False]},
                                                        path=path)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceCountEncoder.build_object(dataset,
                                                    **{"comparison_attributes": ["sequence_aas"],
                                                       "p_value_threshold": 0.4,
                                                       "sequence_batch_size": 4})

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset,
                                         EncoderParams(result_path=path, label_config=label_config))

        examples = encoded_dataset.encoded_data.examples

        # positive-class repertoires (first two) get count 1, negative-class ones get 0
        for index, expected in enumerate([1, 1, 0, 0]):
            self.assertTrue(examples[index] == expected)

        self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

        shutil.rmtree(path)
示例#30
0
    def test_encode(self):
        """DistanceEncoder should yield a square per-example distance matrix with the labels carried over."""
        path = EnvironmentSettings.tmp_test_path + "distance_encoder/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        encoder = DistanceEncoder.build_object(dataset,
                                               **{"distance_metric": DistanceMetricType.JACCARD.name,
                                                  "attributes_to_match": ["sequence_aas"],
                                                  "sequence_batch_size": 20})

        encoder.set_context({"dataset": dataset})
        encoder_params = EncoderParams(result_path=path,
                                       label_config=LabelConfiguration([Label("l1", [0, 1]),
                                                                        Label("l2", [2, 3])]),
                                       pool_size=4,
                                       filename="dataset.pkl")
        encoded = encoder.encode(dataset, encoder_params)

        # one row and one column per example
        self.assertEqual(8, encoded.encoded_data.examples.shape[0])
        self.assertEqual(8, encoded.encoded_data.examples.shape[1])

        # spot-check a few known similarity entries
        for row, col in [(0, 0), (1, 1), (0, 4)]:
            self.assertEqual(1, encoded.encoded_data.examples.iloc[row, col])

        self.assertTrue(
            np.array_equal([1, 0, 1, 0, 1, 0, 1, 0],
                           encoded.encoded_data.labels["l1"]))
        self.assertTrue(
            np.array_equal([2, 3, 2, 3, 2, 3, 3, 3],
                           encoded.encoded_data.labels["l2"]))

        shutil.rmtree(path)