예제 #1
0
    def test_parse_ml_methods(self):
        """MLParser should handle plain, parameterized and grid-search ML method specs."""

        method_specs = {
            "LR1": {
                "LogisticRegression": {
                    "max_iter": 1000,
                    "penalty": "l1",
                }
            },
            "LR2": "LogisticRegression",
            "SVM1": {
                "SVM": {
                    "max_iter": [1000, 2000],
                    "penalty": ["l1", "l2"]
                },
                "model_selection_cv": True,
                "model_selection_n_folds": 5
            },
            "SVM2": {
                "SVM": {},
                "model_selection_cv": False,
                "model_selection_n_folds": -1
            }
        }

        table = SymbolTable()
        table, desc = MLParser.parse(method_specs, table)

        # list-valued hyperparameters end up in the model-selection grid
        svm1_grid = table.get("SVM1")._parameter_grid
        self.assertTrue(svm1_grid is not None and len(svm1_grid["max_iter"]) == 2)

        # scalar hyperparameters are stored directly on the method
        lr1_params = table.get("LR1")._parameters
        self.assertTrue(lr1_params is not None and lr1_params["penalty"] == "l1")

        # a bare class-name spec produces a default-constructed method
        self.assertTrue(isinstance(table.get("LR2"), LogisticRegression))

        self.assertTrue("SVM" in desc["SVM1"].keys())
예제 #2
0
 def test_parse_reports(self):
     """A report spec should register an instantiated report object under its key."""
     report_specs = {"r1": {"SequenceLengthDistribution": {}}}
     table = SymbolTable()
     table, specs = ReportParser.parse_reports(report_specs, table)
     self.assertTrue(table.contains("r1"))
     self.assertTrue(isinstance(table.get("r1"), SequenceLengthDistribution))
예제 #3
0
    def test_parse(self):
        """Each preprocessing sequence should parse into a one-element list under its key,
        and the returned specs should preserve the input key order."""
        workflow_specs = {
            "seq1": [{"filter_chain_B": {"ChainRepertoireFilter": {"keep_chain": "A"}}}],
            "seq2": [{"filter_chain_A": {"ChainRepertoireFilter": {"keep_chain": "B"}}}],
        }
        table, specs = PreprocessingParser.parse(workflow_specs, SymbolTable())

        for key in ("seq1", "seq2"):
            self.assertTrue(table.contains(key))

        parsed_seq1 = table.get("seq1")
        self.assertTrue(isinstance(parsed_seq1, list) and len(parsed_seq1) == 1)
        self.assertEqual(list(workflow_specs.keys()), list(specs.keys()))
예제 #4
0
    def test_parse_receptor_dataset(self):
        """ImportParser should load a paired-chain VDJdb export into a ReceptorDataset.

        The fixture holds two complexes (complex.id 3050 and 15760), each with
        one TRA and one TRB row, so the parsed dataset has exactly two examples.
        """
        # Raw VDJdb export: tab-separated, one chain per row, chains paired via complex.id.
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
                """
        path = EnvironmentSettings.root_path / "test/tmp/dslimportparservdj/"
        data_path = EnvironmentSettings.root_path / "test/tmp/dslimportparservdj/receptor_data/"
        PathBuilder.build(data_path)

        with open(data_path / "receptors.tsv", "w") as file:
            file.writelines(file_content)

        # NOTE(review): paired=True with receptor_chains="TRA_TRB" presumably makes
        # the VDJdb importer pair rows into receptor objects, while is_repertoire=False
        # selects an element (non-repertoire) dataset — confirm against the importer.
        st, desc = ImportParser.parse(
            {
                "datasets": {
                    "d1": {
                        "format": "VDJdb",
                        "params": {
                            "is_repertoire": False,
                            "paired": True,
                            "receptor_chains": "TRA_TRB",
                            "path": data_path
                        }
                    }
                }
            }, SymbolTable(), path)

        dataset = st.get("d1")
        self.assertTrue(isinstance(dataset, ReceptorDataset))
        # two TRA/TRB complexes in the fixture -> two receptor examples
        self.assertEqual(2, dataset.get_example_count())

        shutil.rmtree(path)
    def test_parse(self):
        """DatasetExportParser should produce an instruction with one exporter per
        requested format and one entry per referenced dataset."""
        instruction_specs = {
            "type": "DatasetExport",
            "export_formats": ["Pickle", "AIRR"],
            "datasets": ["d1"]
        }

        table = SymbolTable()
        table.add("d1", SymbolType.DATASET, RepertoireDataset())

        instruction = DatasetExportParser().parse("instr1", instruction_specs, table)

        self.assertTrue(isinstance(instruction, DatasetExportInstruction))
        self.assertEqual(2, len(instruction.exporters))
        self.assertEqual(1, len(instruction.datasets))
    def test_parse(self):
        """A valid subsampling spec parses; oversized subsample counts, unknown
        dataset keys and unknown export formats each raise AssertionError."""
        path = PathBuilder.build(
            f'{EnvironmentSettings.tmp_test_path}subsampling_parser/')
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            30, {3: 1}, {2: 1}, {}, path)

        table = SymbolTable()
        table.add("d1", SymbolType.DATASET, dataset)

        valid_specs = {
            'dataset': 'd1',
            'type': 'Subsampling',
            'subsampled_dataset_sizes': [10, 20],
            'dataset_export_formats': ['Pickle']
        }
        SubsamplingParser().parse('inst1', valid_specs, table)

        invalid_spec_cases = [
            # requested size 50 exceeds the 30 examples in the dataset
            {'dataset': 'd1', 'type': 'Subsampling',
             'subsampled_dataset_sizes': [10, 50],
             'dataset_export_formats': ['Pickle']},
            # 'd2' is not registered in the symbol table
            {'dataset': 'd2', 'type': 'Subsampling',
             'subsampled_dataset_sizes': [10, 20],
             'dataset_export_formats': ['Pickle']},
            # 'Random' is not a known export format
            {'dataset': 'd2', 'type': 'Subsampling',
             'subsampled_dataset_sizes': [10, 20],
             'dataset_export_formats': ['Random']},
        ]
        for bad_specs in invalid_spec_cases:
            with self.assertRaises(AssertionError):
                SubsamplingParser().parse('inst1', bad_specs, table)

        shutil.rmtree(path)
예제 #7
0
    def parse(workflow_specification: dict, file_path, result_path):
        """Parse a complete workflow specification in three stages —
        definitions, then instructions, then output — and re-export the specs.

        Returns the populated symbol table and the path of the written
        specification file.
        """
        table = SymbolTable()

        definitions_output, definition_specs = DefinitionParser.parse(
            workflow_specification, table, result_path)
        table, instruction_specs = InstructionParser.parse(
            definitions_output, result_path)
        app_output = OutputParser.parse(workflow_specification, table)

        # write the fully-resolved specification back out for reproducibility
        specs_path = ImmuneMLParser._output_specs(file_path=file_path,
                                                  result_path=result_path,
                                                  definitions=definition_specs,
                                                  instructions=instruction_specs,
                                                  output=app_output)

        return table, specs_path
예제 #8
0
    def _extract_reports(self):
        """Load the benchmark report definitions named by the first instruction in
        the YAML spec, validate them, and instantiate them onto self.reports."""
        with self.specification_path.open("r") as file:
            spec = yaml.safe_load(file)

        # the benchmark report names live on the first (and only relevant) instruction
        first_instruction = list(spec['instructions'].values())[0]
        report_keys = first_instruction['benchmark_reports']

        defined_reports = spec['definitions']['reports']
        ParameterValidator.assert_all_in_valid_list(
            report_keys, list(defined_reports.keys()),
            MultiDatasetBenchmarkTool.__name__, "benchmark_reports")

        selected_reports = {name: body
                            for name, body in defined_reports.items()
                            if name in report_keys}
        table, _ = ReportParser.parse_reports(selected_reports, SymbolTable())
        self.reports = [entry.item
                        for entry in table.get_by_type(SymbolType.REPORT)]
예제 #9
0
    def test_parse_simulation(self):
        """A simulation spec referencing a registered signal should parse into a
        simulation with a single implanting."""
        simulation_specs = {
            "sim1": {
                "var1": {
                    "signals": ["signal1"],
                    "dataset_implanting_rate": 0.5,
                    "repertoire_implanting_rate": 0.1
                }
            }
        }

        table = SymbolTable()
        motif = Motif("motif1", GappedKmerInstantiation(position_weights={0: 1}),
                      seed="CAS")
        table.add("motif1", SymbolType.MOTIF, motif)

        implanting_strategy = HealthySequenceImplanting(
            GappedMotifImplanting(),
            implanting_computation=ImplantingComputation.ROUND)
        table.add("signal1", SymbolType.SIGNAL,
                  Signal("signal1", [table.get("motif1")], implanting_strategy))

        table, specs = SimulationParser.parse_simulations(simulation_specs, table)

        self.assertTrue(table.contains("sim1"))
        parsed_simulation = table.get("sim1")
        self.assertEqual(1, len(parsed_simulation.implantings))
예제 #10
0
 def test_add(self):
     """Re-adding an existing key should warn instead of overwriting silently."""
     table = SymbolTable()
     table.add("svm1", SymbolType.ML_METHOD, {})
     with self.assertWarns(Warning):
         table.add("svm1", SymbolType.ML_METHOD, {})
예제 #11
0
    def test_parse(self):
        """MiXCR repertoire import: two tab-separated clone files plus a metadata
        CSV should parse into a RepertoireDataset with two repertoires.
        """
        path = EnvironmentSettings.root_path / "test/tmp/parser/"

        # First input file: two TRA clones for subject CD1.
        PathBuilder.build(path / "tmp_input/")
        with open(path / "tmp_input/CD1_clones_TRA.csv", "w") as file:
            writer = csv.DictWriter(file,
                                    delimiter="\t",
                                    fieldnames=[
                                        "patient", "dilution", "cloneCount",
                                        "allVHitsWithScore",
                                        "allJHitsWithScore", "nSeqCDR1",
                                        "nSeqCDR2", "nSeqCDR3", "minQualCDR3",
                                        "aaSeqCDR1", "aaSeqCDR2", "aaSeqCDR3",
                                        "sampleID"
                                    ])
            dicts = [{
                "patient": "CD12",
                "dilution": "108'",
                "cloneCount": 3,
                "allVHitsWithScore": "TRAV13-1*00(735)",
                "allJHitsWithScore": "TRAJ15*00(243)",
                "nSeqCDR1": "TGTGCAGCAA",
                "nSeqCDR2": "TGTGCAGCAA",
                "nSeqCDR3": "TGTGCAGCAA",
                "minQualCDR3": 10,
                "aaSeqCDR1": "VFAVFA",
                "aaSeqCDR2": "VFAVFA",
                "aaSeqCDR3": "VFAVFA",
                "sampleID": "2"
            }, {
                "patient": "CD12",
                "dilution": "108'",
                "cloneCount": 5,
                "allVHitsWithScore": "TRAV14-1*00(735)",
                "allJHitsWithScore": "TRAJ12*00(243)",
                "nSeqCDR1": "CAATGTGA",
                "nSeqCDR2": "CAATGTGA",
                "nSeqCDR3": "CAATGTGA",
                "minQualCDR3": 10,
                "aaSeqCDR1": "CASCAS",
                "aaSeqCDR2": "CASCAS",
                "aaSeqCDR3": "CASCAS",
                "sampleID": "3"
            }]

            writer.writeheader()
            writer.writerows(dicts)

        # Second input file: three TRB clones for subject HC2 (last two are duplicates).
        with open(path / "tmp_input/HC2_clones_TRB.csv", "w") as file:
            writer = csv.DictWriter(file,
                                    delimiter="\t",
                                    fieldnames=[
                                        "patient", "dilution", "cloneCount",
                                        "allVHitsWithScore",
                                        "allJHitsWithScore", "nSeqCDR1",
                                        "nSeqCDR2", "nSeqCDR3", "minQualCDR3",
                                        "aaSeqCDR1", "aaSeqCDR2", "aaSeqCDR3",
                                        "sampleID"
                                    ])
            dicts = [{
                "patient": "CD12",
                "dilution": "108'",
                "cloneCount": 3,
                "allVHitsWithScore": "TRAV13-1*00(735)",
                "allJHitsWithScore": "TRAJ15*00(243)",
                "nSeqCDR1": "TGTGCAGCAA",
                "nSeqCDR2": "TGTGCAGCAA",
                "nSeqCDR3": "TGTGCAGCAA",
                "minQualCDR3": 10,
                "aaSeqCDR1": "CAASNQA",
                "aaSeqCDR2": "CAASNQA",
                "aaSeqCDR3": "CAASNQA",
                "sampleID": "1"
            }, {
                "patient": "CD12",
                "dilution": "108'",
                "cloneCount": 6,
                "allVHitsWithScore": "TRAV19-1*00(735)",
                "allJHitsWithScore": "TRAJ12*00(243)",
                "nSeqCDR1": "CAATGTGA",
                "nSeqCDR2": "CAATGTGA",
                "nSeqCDR3": "CAATGTGA",
                "minQualCDR3": 10,
                "aaSeqCDR1": "CAASNTTA",
                "aaSeqCDR2": "CAASNTTA",
                "aaSeqCDR3": "CAASNTTA",
                "sampleID": 1
            }, {
                "patient": "CD12",
                "dilution": "108'",
                "cloneCount": 6,
                "allVHitsWithScore": "TRAV19-1*00(735)",
                "allJHitsWithScore": "TRAJ12*00(243)",
                "nSeqCDR1": "CAATGTGA",
                "nSeqCDR2": "CAATGTGA",
                "nSeqCDR3": "CAATGTGA",
                "minQualCDR3": 10,
                "aaSeqCDR1": "CAASNTTA",
                "aaSeqCDR2": "CAASNTTA",
                "aaSeqCDR3": "CAASNTTA",
                "sampleID": 1
            }]

            writer.writeheader()
            writer.writerows(dicts)

        # Metadata maps each clone file to a subject and a boolean CD label.
        metadata = pd.DataFrame({
            "filename": ["HC2_clones_TRB.csv", "CD1_clones_TRA.csv"],
            "subject_id": ["HC2", "CD1"],
            "CD": [False, True]
        })
        metadata.to_csv(path / "metadata.csv")
        specs = {
            "datasets": {
                "d1": {
                    "format": "MiXCR",
                    "params": {
                        "is_repertoire": True,
                        "path": path / "tmp_input/",
                        "metadata_file": path / "metadata.csv",
                        "number_of_processes": 2,
                    }
                }
            }
        }

        st, desc = ImportParser.parse(specs, SymbolTable(),
                                      path / "tmp_output/")
        self.assertTrue(isinstance(st.get("d1"), RepertoireDataset))
        # two metadata rows -> two repertoires in the dataset
        self.assertEqual(2, len(st.get("d1").get_data()))

        shutil.rmtree(path)
    def test_parse(self):
        """ExploratoryAnalysisParser should build one analysis unit per spec entry,
        wiring each unit's report, encoder, preprocessing and labels from the
        symbol table, and propagating number_of_processes to every unit.
        """
        path = EnvironmentSettings.tmp_test_path / "explanalysisparser/"
        PathBuilder.build(path)

        dataset = self.prepare_dataset(path)
        report1 = SequenceLengthDistribution()

        # Minimal VDJdb reference file (tab-separated, one TRA row) used by the
        # matched-sequences encoder below.
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
        """

        with open(path / "refs.tsv", "w") as file:
            file.writelines(file_content)

        refs = {
            "params": {
                "path": path / "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        report2 = Matches.build_object()
        encoding = MatchedSequencesEncoder
        p1 = [SubjectRepertoireCollector()]

        # Three analyses: report+preprocessing, report+encoding, and
        # report+encoding+labels — covering the parser's optional fields.
        instruction = {
            "type": "ExploratoryAnalysis",
            "number_of_processes": 32,
            "analyses": {
                "1": {
                    "dataset": "d1",
                    "report": "r1",
                    "preprocessing_sequence": "p1"
                },
                "2": {
                    "dataset": "d1",
                    "report": "r2",
                    "encoding": "e1",
                },
                "3": {
                    "dataset": "d1",
                    "report": "r2",
                    "encoding": "e1",
                    "labels": ["l1"]
                }
            }
        }

        symbol_table = SymbolTable()
        symbol_table.add("d1", SymbolType.DATASET, dataset)
        symbol_table.add("r1", SymbolType.REPORT, report1)
        symbol_table.add("r2", SymbolType.REPORT, report2)
        symbol_table.add(
            "e1", SymbolType.ENCODING, encoding,
            {"encoder_params": {
                "max_edit_distance": 1,
                "reference": refs
            }})
        symbol_table.add("p1", SymbolType.PREPROCESSING, p1)

        process = ExploratoryAnalysisParser().parse("a", instruction,
                                                    symbol_table)

        # one unit per entry in the "analyses" spec
        self.assertEqual(
            3, len(list(process.state.exploratory_analysis_units.values())))
        self.assertTrue(
            isinstance(
                list(process.state.exploratory_analysis_units.values())
                [0].report, SequenceLengthDistribution))

        # testing matches with and without labels
        self.assertTrue(
            isinstance(
                list(process.state.exploratory_analysis_units.values())
                [1].report, Matches))
        self.assertTrue(
            isinstance(
                list(process.state.exploratory_analysis_units.values())
                [1].encoder, MatchedSequencesEncoder))
        # the reference file above defines a single sequence
        self.assertEqual(
            1,
            len(
                list(process.state.exploratory_analysis_units.values())
                [1].encoder.reference_sequences))

        self.assertTrue(
            isinstance(
                list(process.state.exploratory_analysis_units.values())
                [2].report, Matches))
        self.assertTrue(
            isinstance(
                list(process.state.exploratory_analysis_units.values())
                [2].encoder, MatchedSequencesEncoder))
        self.assertEqual(
            1,
            len(
                list(process.state.exploratory_analysis_units.values())
                [2].encoder.reference_sequences))
        # labels from the spec end up in the unit's label configuration
        self.assertEqual(
            "l1",
            list(process.state.exploratory_analysis_units.values())
            [2].label_config.get_labels_by_name()[0])
        # top-level number_of_processes is propagated to each unit
        self.assertEqual(
            32,
            process.state.exploratory_analysis_units["2"].number_of_processes)

        shutil.rmtree(path)