Example #1
    def _generate(self) -> ReportResult:
        from immuneML.util.TCRdistHelper import TCRdistHelper
        from tcrdist.rep_diff import hcluster_diff
        from tcrdist.summarize import member_summ

        PathBuilder.build(self.result_path)

        subsampled_dataset = self._extract_positive_example_dataset()
        reference_sequences = self._extract_reference_sequences()
        tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores)
        tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df, pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta, x_cols=["epitope"],
                                                       count_col='count')

        figures, tables = [], []

        logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

        for index, row in tcr_rep.hcluster_df.iterrows():
            if len(row['neighbors_i']) >= self.min_cluster_size:
                figure_outputs, table_outputs = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
                figures.extend(figure_outputs)
                tables.extend(table_outputs)

        res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
        res_summary.to_csv(self.result_path / "tcrdist_summary.csv")

        tables.append(ReportOutput(path=self.result_path / "tcrdist_summary.csv", name="TCRdist summary (csv)"))

        return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables)
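
For context, hcluster_diff clusters clones hierarchically over the summed alpha+beta TCRdist matrix. Below is a minimal sketch of that underlying technique using scipy on a synthetic distance matrix; the matrix values and the cutoff threshold are illustrative assumptions, not tcrdist3's defaults.

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

# symmetric pairwise distance matrix for four hypothetical clones
pw = np.array([[0, 10, 80, 85],
               [10, 0, 82, 88],
               [80, 82, 0, 12],
               [85, 88, 12, 0]], dtype=float)

# linkage expects the condensed (upper-triangle) distance vector
Z = linkage(squareform(pw), method="complete")

# cut the dendrogram at an illustrative distance threshold of 50
labels = fcluster(Z, t=50, criterion="distance")
print(labels)  # e.g. [1 1 2 2] -- two clusters of two clones each
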
    def create_dataset(self, path, dataset_size: int = 50):

        sequences = []

        for i in range(dataset_size):
            if i % 2 == 0:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="AAACCC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 1})))
            else:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="ACACAC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 2})))

        PathBuilder.build(path)
        filename = path / "sequences.pkl"
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(labels={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for _ in range(5)],
            j_genes=["J1-1" for _ in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = path / "receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            dataset = ReceptorDataset(filenames=[receptordataset_filename],
                                      identifier="receptor_dataset")

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
Example #4
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)

        test_metadata_filepath = self.test_dataset.encoded_data.info[
            'metadata_filepath']
        label_names = [self.label]
        hdf5_filepath = self.method._metadata_to_hdf5(test_metadata_filepath,
                                                      label_names)

        n_examples_test = len(self.test_dataset.encoded_data.example_ids)
        indices = np.arange(n_examples_test)

        dataloader = self.method.make_data_loader(hdf5_filepath,
                                                  pre_loaded_hdf5_file=None,
                                                  indices=indices,
                                                  label=self.label,
                                                  eval_only=True,
                                                  is_train=False)

        model = self.method.get_model(self.label)[self.label]

        compute_contributions(intgrds_set_loader=dataloader,
                              deeprc_model=model,
                              n_steps=self.n_steps,
                              threshold=self.threshold,
                              resdir=self.result_path,
                              filename_inputs=self.filename_inputs,
                              filename_kernels=self.filename_kernels)

        return ReportResult(self.name,
                            output_figures=[
                                ReportOutput(self.filename_inputs),
                                ReportOutput(self.filename_kernels)
                            ])
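
A minimal sketch of the pattern make_data_loader follows here: restricting a PyTorch DataLoader to a fixed, ordered set of example indices for evaluation. The tensors and batch size are synthetic, illustrative assumptions.

import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

features = torch.randn(10, 3)          # ten hypothetical examples with three features
targets = torch.randint(0, 2, (10,))   # binary labels
dataset = TensorDataset(features, targets)

indices = list(range(len(dataset)))    # evaluate on every test example, in order
loader = DataLoader(Subset(dataset, indices), batch_size=4, shuffle=False)

for batch_features, batch_targets in loader:
    print(batch_features.shape, batch_targets.shape)
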
    def test_generate(self):
        path = EnvironmentSettings.tmp_test_path / "relevant_sequence_exporter/"
        PathBuilder.build(path)

        df = pd.DataFrame({
            "v_genes": ["TRBV1-1", "TRBV1-1"],
            'j_genes': ["TRBJ1-1", "TRBJ1-2"],
            "sequence_aas": ['ACCF', "EEFG"]
        })
        df.to_csv(path / 'sequences.csv', index=False)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            2, {2: 1}, {4: 1}, {}, path / "data")
        dataset.encoded_data = EncodedData(
            examples=None,
            info={'relevant_sequence_path': path / 'sequences.csv'},
            encoding="SequenceAbundanceEncoder")

        report_result = RelevantSequenceExporter(dataset, path / "result",
                                                 'somename').generate_report()

        self.assertEqual(1, len(report_result.output_tables))
        self.assertTrue(os.path.isfile(report_result.output_tables[0].path))

        self.assertTrue(
            all(col in ["v_call", "j_call", "cdr3_aa"] for col in pd.read_csv(
                report_result.output_tables[0].path).columns))

        shutil.rmtree(path)
Example #6
    def test_load_sequence_dataset(self):
        """Test dataset content with and without a header included in the input file"""
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/", "igor")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True

        dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")

        seqs = list(dataset.get_data())

        self.assertEqual(4, dataset.get_example_count())

        self.assertListEqual(
            sorted([
                "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
                "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
                "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
                "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC"
            ]), sorted([seq.nucleotide_sequence for seq in seqs]))

        shutil.rmtree(path)
Example #7
    def export(hp_item: HPItem, path: Path) -> Path:
        PathBuilder.build(path)
        preproc_filename = MLExporter._store_preprocessing_sequence(
            hp_item.hp_setting.preproc_sequence, path).name
        encoder_filename = MLExporter._store_encoder(
            hp_item.hp_setting.encoder, path).name

        hp_item.method.store(path, hp_item.method.get_feature_names())
        labels_with_values = {
            hp_item.method.get_label(): hp_item.method.get_classes()
        }

        method_config = MLMethodConfiguration(
            labels_with_values=labels_with_values,
            software_used=hp_item.method.get_package_info(),
            encoding_name=hp_item.hp_setting.encoder_name,
            encoding_parameters=hp_item.hp_setting.encoder_params,
            encoding_file=encoder_filename,
            encoding_class=type(hp_item.hp_setting.encoder).__name__,
            ml_method=type(hp_item.method).__name__,
            ml_method_name=hp_item.method.name,
            train_dataset_id=hp_item.train_dataset.identifier,
            train_dataset_name=hp_item.train_dataset.name,
            preprocessing_sequence_name=hp_item.hp_setting.preproc_sequence_name,
            preprocessing_file=os.path.basename(preproc_filename),
            preprocessing_parameters={
                type(seq).__name__: vars(seq)
                for seq in hp_item.hp_setting.preproc_sequence
            })

        method_config.store(path / 'ml_config.yaml')

        return path
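
The ml_config.yaml written above is a plain YAML mapping; a minimal sketch of the store/load round trip with PyYAML, using an illustrative subset of the configuration fields:

from pathlib import Path

import yaml

config = {
    "ml_method": "LogisticRegression",                     # illustrative values
    "encoding_class": "KmerFrequencyEncoder",
    "labels_with_values": {"disease": ["true", "false"]},
}

path = Path("ml_config.yaml")
with path.open("w") as file:
    yaml.dump(config, file)

with path.open("r") as file:
    restored = yaml.safe_load(file)

assert restored == config
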
Example #8
    def make_dummy_dataset(self, path, add_metadata):
        rep1text = """Clone ID	Senior Author	TRAJ Gene	TRAV Gene	CDR3A AA Sequence	TRBV Gene	TRBD Gene	TRBJ Gene	CDR3B AA Sequence	Antigen Protein	Antigen Gene	Antigen Species	Antigen Peptide AA #	Epitope Peptide	MHC Class	HLA Restriction	Counts
1E6	Sewell	TRAJ12	TRAV12-3	CAMRGDSSYKLIF	TRBV12-4	TRBD2	TRBJ2-4	CASSLWEKLAKNIQYF	PPI	INS	Human	12-24	ALWGPDPAAA	MHC I	A*02:01	1
4.13	Nepom	TRAJ44	TRAV19	CALSENRGGTASKLTF	TRBV5-1	TRBD1	TRBJ1-1	CASSLVGGPSSEAFF	GAD		Human	555-567		MHC II	DRB1*04:01	3
5	Roep	TRAJ6	TRAV21	CAVKRTGGSYIPTF	TRBV11-2	TRBD1	TRBJ2-2	CASSSFWGSDTGELFF	Insulin B		Human	9-23		MHC II	DQ8	7
D222D 2	Mallone	TRAJ36*01	TRAV17*01	CAVTGANNLFF	TRBV19*01	TRBD1*01	TRBJ2-2*01	CASSIEGPTGELFF	Zinc Transporter 8	ZnT8	Human	185-194	AVAANIVLTV	MHC I	A*02:01	2
GSE.20D11	Nakayama	TRAJ4	TRAV12-3	CAILSGGYNKLIF	TRBV2	TRBD2	TRBJ2-5	CASSAETQYF	Insulin B		Human	9-23		MHC II	DQ8	10
GSE.6H9	Nakayama	TRAJ40	TRAV26-1	CIVRVDSGTYKYIF	TRBV7-2	TRBD2	TRBJ2-1	CASSLTAGLASTYNEQFF	Insulin B		Human	9-23		MHC II	DQ8/DQ8	nan
iGRP 32	DiLorenzo	TRAJ48	TRAV12-1	CVVNILSNFGNEKLTF	TRBV20/OR9-2	TRBD1	TRBJ2-1	CSASRQGWVNEQFF	IGRP		Human	265-273		MHC I	A*02:01	1
MART-1	TBD	TRAJ23	TRAV12-2	CAVNFGGGKLIF	TRBV6-4	TRBD2	TRBJ1-1	CASSLSFGTEAFF	Melan A		Human	27-35	ELAGIGILTV	MHC I	A2	3
MHB10.3	TBD	TRAJ27	TRAV4	CLVGDSLNTNAGKSTF	TRBV29-1	TRBD2	TRBJ2-2	CSVEDRNTGELFF	Insulin B		Human	11-30		MHC II	DRB1*03:01	NA
PM1#11	TBD	TRAJ54	TRAV35	CAGHSIIQGAQKLVF	TRBV5-1	TRBD2	TRBJ2-1	CASGRSSYNEQFF	GAD		Human	339-352		MHC II	DRB1*03:01	2
R164	Nepom	TRAJ56	TRAV19	CALSEEGGGANSKLTF	TRBV5-1	TRBD2	TRBJ1-6	CASSLAGGANSPLHF	GAD		Human	555-567		MHC II	DRB1*04:01	1
SD32.5	Boehm	TRAJ23	TRAV26-1	CIVRVSSAYYNQGGKLIF	TRBV27	TRBD2	TRBJ2-3	CASSPRANTDTQYF	Insulin A		Human	5-21		MHC II	DRB1*04:01	1
SD52.c1	Boehm	TRAJ27	TRAV4	CLVGDSLNTNAGKSTF	TRBV27	TRBD1	TRBJ1-5	CASSWSSIGNQPQHF	PPI	INS	Human	C18-A1		MHC II	DRB1*04:01	1
T1D#10 C8	TBD	TRAJ26	TRAV12-3	CATAYGQNFVF	TRBV4-1	TRBD2	TRBJ2-2	CASSRGGGNTGELFF	Insulin B		Human	9-23		MHC II	DQ8	1
T1D#3 C8	TBD	TRAJ23	TRAV17	CATDAGYNQGGKLIF	TRBV5-1	TRBD2	TRBJ1-3	CASSAGNTIYF	Insulin B		Human	9-23		MHC II	DQ8	1"""
        PathBuilder.build(path)

        with open(path / "rep1.tsv", "w") as file:
            file.writelines(rep1text)

        if add_metadata:
            with open(path / "metadata.csv", "w") as file:
                file.writelines(
                    """filename,chain,subject_id,coeliac status (yes/no)
rep1.tsv,TRA,1234e,no""")
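
To sanity-check that tab-separated dummy data like rep1.tsv parses as intended, a hedged sketch with pandas on an in-memory TSV (reduced to two illustrative columns):

import io

import pandas as pd

tsv_text = "Clone ID\tCDR3B AA Sequence\n1E6\tCASSLWEKLAKNIQYF\n"
df = pd.read_csv(io.StringIO(tsv_text), sep="\t")

assert list(df.columns) == ["Clone ID", "CDR3B AA Sequence"]
assert df.shape == (1, 2)
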
Example #9
    def standard_scale(scaler_file: Path,
                       design_matrix,
                       with_mean: bool = True):
        """
        scale to zero mean and unit variance on feature level
        :param scaler_file: path to scaler file fitted on train set or where the resulting scaler file will be stored
        :param design_matrix: rows -> examples, columns -> features
        :param with_mean: whether to scale to zero mean or not (could lose sparsity if scaled)
        :return: scaled design matrix
        """

        if with_mean and hasattr(design_matrix, "todense"):
            scaled_design_matrix = design_matrix.todense()
        else:
            scaled_design_matrix = design_matrix

        if scaler_file.is_file():
            with scaler_file.open('rb') as file:
                scaler = pickle.load(file)
                scaled_design_matrix = scaler.transform(scaled_design_matrix)
        else:
            scaler = StandardScaler(with_mean=with_mean)
            scaled_design_matrix = scaler.fit_transform(scaled_design_matrix)

            directory = scaler_file.parent
            PathBuilder.build(directory)

            with scaler_file.open('wb') as file:
                pickle.dump(scaler, file)

        return scaled_design_matrix
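
The fit-once-then-reuse pattern above can be exercised with scikit-learn directly; a minimal sketch in which the file name and data are illustrative:

import pickle
from pathlib import Path

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[2.0, 20.0]])

scaler = StandardScaler(with_mean=True)
X_train_scaled = scaler.fit_transform(X_train)  # fit on the training set only

scaler_file = Path("scaler.pickle")
with scaler_file.open("wb") as file:
    pickle.dump(scaler, file)  # persist so the test set reuses the same statistics

with scaler_file.open("rb") as file:
    X_test_scaled = pickle.load(file).transform(X_test)

print(X_test_scaled)  # [[0. 0.]] -- the training mean maps to zero
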
Example #10
    def create_dummy_receptordataset(self, path):
        receptors = [TCABReceptor(identifier="1",
                                  alpha=ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a",
                                                         metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1",
                                                                                   chain=Chain.ALPHA,
                                                                                   frame_type="IN",
                                                                                   custom_params={"d_call": "TRAD1",
                                                                                                  "custom1": "cust1"})),
                                  beta=ReceptorSequence(amino_acid_sequence="ATATAT", identifier="1b",
                                                        metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1",
                                                                                  chain=Chain.BETA,
                                                                                  frame_type="IN",
                                                                                  custom_params={"d_call": "TRBD1",
                                                                                                 "custom1": "cust1"}))),
                     TCABReceptor(identifier="2",
                                  alpha=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2a",
                                                         metadata=SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1",
                                                                                   chain=Chain.ALPHA,
                                                                                   frame_type="IN",
                                                                                   custom_params={"d_call": "TRAD1",
                                                                                                  "custom2": "cust1"})),
                                  beta=ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="2b",
                                                        metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1",
                                                                                  chain=Chain.BETA,
                                                                                  frame_type="IN",
                                                                                  custom_params={"d_call": "TRBD1",
                                                                                                 "custom2": "cust1"})))]

        receptors_path = path / "receptors"
        PathBuilder.build(receptors_path)
        return ReceptorDataset.build_from_objects(receptors, 2, receptors_path)
Example #11
    def create_dataset(self):
        path = Path(
            os.path.relpath(EnvironmentSettings.root_path /
                            "test/tmp/immunemlapp/initial_dataset"))
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build(
            [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)],
            path, {
                "CD": [
                    'yes' if i % 2 == 0 else 'no'
                    for i in range(repertoire_count)
                ],
                "CMV": [
                    True if i % 2 == 1 else False
                    for i in range(repertoire_count)
                ]
            }, [[{
                "chain": "A" if i % 2 == 0 else "B",
                "count": random.randint(2, 5)
            } for i in range(4)] for j in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "CD": [True, False],
                                        "CMV": [True, False]
                                    },
                                    name="d1")
        PickleExporter.export(dataset, path)

        return path / "d1.iml_dataset"
Example #12
    def test_repertoire_export(self):
        path = EnvironmentSettings.tmp_test_path / "airr_exporter_repertoire/"
        PathBuilder.build(path)

        repertoire, metadata_path = self.create_dummy_repertoire(path)
        dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

        path_exported = path / "exported"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(path_exported / f"repertoires/{repertoire.identifier}.tsv", sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]), ["receptor_1", "receptor_2"])
        self.assertListEqual(list(resulting_data["cdr3"]), ["GCTGCTGCT", "GGTGGTGGT"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1", "TRAV2*01"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1", "TRAJ2"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1", "TRAD2"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
        self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
        self.assertListEqual(list(resulting_data["custom_test"]), ["cust1", "cust2"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', 'F'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

        shutil.rmtree(path)
Example #13
    def test_sequence_export(self):
        path = EnvironmentSettings.tmp_test_path / "airr_exporter_receptor/"
        PathBuilder.build(path)

        dataset = self.create_dummy_sequencedataset(path)

        path_exported = path / "exported_sequences"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(path_exported / "batch1.tsv", sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]), ["1a", "1b"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAATTT", "ATATAT"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRAV1", "TRBV1"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRAJ1", "TRBJ1"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRAD1", "TRBD1"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRA", "TRB"])
        self.assertListEqual(list(resulting_data["custom1"]), ["cust1", nan])
        self.assertListEqual(list(resulting_data["custom2"]), [nan, "cust1"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', 'T'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

        resulting_data = pd.read_csv(path_exported / "batch2.tsv", sep="\t")
        self.assertListEqual(list(resulting_data["sequence_id"]), ["2b"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["ATATAT"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB"])
        self.assertListEqual(list(resulting_data["custom2"]), ["cust1"])
        self.assertListEqual(list(resulting_data["productive"]), ['T'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F'])

        shutil.rmtree(path)
Example #14
    def test_make_subset(self):
        sequences = []
        for i in range(100):
            sequences.append(ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i)))

        path = EnvironmentSettings.tmp_test_path / "element_generator_subset/"
        PathBuilder.build(path)

        for i in range(10):
            filepath = path / f"batch{i}.npy"
            sequences_to_pickle = sequences[i * 10:(i + 1) * 10]
            sequence_matrix = np.core.records.fromrecords([seq.get_record() for seq in sequences_to_pickle], names=ReceptorSequence.get_record_names())
            np.save(str(filepath), sequence_matrix, allow_pickle=False)

        d = SequenceDataset(filenames=[path / f"batch{i}.npy" for i in range(10)], file_size=10)

        indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

        d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

        for batch in d2.get_batch(1000):
            for sequence in batch:
                self.assertTrue(int(sequence.identifier) in indices)

        self.assertEqual(15, d2.get_example_count())

        shutil.rmtree(path)
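
A minimal sketch of the record-array batching used in this test, with numpy alone; the field names and values are illustrative:

import numpy as np

records = [("AAA", 0), ("CCC", 1)]
matrix = np.rec.fromrecords(records, names="sequence_aa,identifier")

# no object dtypes are involved, so a pickle-free save is possible
np.save("batch0.npy", matrix, allow_pickle=False)
loaded = np.load("batch0.npy")

print(loaded["sequence_aa"])  # ['AAA' 'CCC']
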
Example #15
    def test_load_repertoire(self):
        """Test dataset content with and without a header included in the input file"""
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["metadata_file"] = path / "metadata.csv"

        dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(len(dataset.repertoires[0].sequences), 1)
        self.assertEqual(len(dataset.repertoires[1].sequences), 1)

        self.assertEqual(
            dataset.repertoires[0].sequences[0].amino_acid_sequence,
            "ARDRWSTPVLRYFDWWTPPYYYYMDV")

        self.assertListEqual(list(dataset.repertoires[0].get_counts()), [1])
        self.assertEqual(dataset.repertoires[0].get_chains(), None)

        shutil.rmtree(path)
Example #16
    def _generate(self) -> ReportResult:
        report_result = ReportResult(
            name=self.name,
            info="Plots ROC curves for all trained ML settings ([preprocessing], encoding, ML model) "
                 "in the outer loop of cross-validation in the TrainMLModel instruction")

        PathBuilder.build(self.result_path)

        for label in self.state.label_configuration.get_label_objects():
            if len(label.values) != 2:
                logging.warning(
                    f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} as it has {len(label.values)} "
                    f"classes, while this report expects 2 classes.")
            elif label.positive_class is None:
                logging.warning(
                    f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} because 'positive_class' parameter "
                    f"is not set.")
            else:
                for index in range(self.state.assessment.split_count):
                    figure = self._create_figure_for_assessment_split(
                        index, label)
                    report_result.output_figures.append(figure)

        return report_result
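
A minimal sketch of the per-split ROC computation with scikit-learn; the labels and scores are synthetic:

import numpy as np
from sklearn.metrics import auc, roc_curve

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])  # predicted probability of the positive class

fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)
print(auc(fpr, tpr))  # 0.75 for these synthetic scores
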
Example #17
    def test_load_repertoire_with_stop_codon(self):
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True
        params["metadata_file"] = path / "metadata.csv"

        dataset_stop_codons = IGoRImport.import_dataset(
            params, "igor_dataset_stop")

        self.assertEqual(2, dataset_stop_codons.get_example_count())
        self.assertEqual(len(dataset_stop_codons.repertoires[0].sequences), 2)
        self.assertEqual(len(dataset_stop_codons.repertoires[1].sequences), 2)

        self.assertEqual(
            dataset_stop_codons.repertoires[0].sequences[0].amino_acid_sequence,
            "ARVNRHIVVVTAIMTG*NWFDP")

        shutil.rmtree(path)
Example #18
    def encode_dataset(dataset,
                       hp_setting: HPSetting,
                       path: Path,
                       learn_model: bool,
                       context: dict,
                       number_of_processes: int,
                       label_configuration: LabelConfiguration,
                       encode_labels: bool = True,
                       store_encoded_data: bool = False):
        PathBuilder.build(path)

        encoded_dataset = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=hp_setting.encoder,
                              encoder_params=EncoderParams(
                                  model=hp_setting.encoder_params,
                                  result_path=path,
                                  pool_size=number_of_processes,
                                  label_config=label_configuration,
                                  learn_model=learn_model,
                                  filename="train_dataset.pkl"
                                  if learn_model else "test_dataset.pkl",
                                  encode_labels=encode_labels),
                              store_encoded_data=store_encoded_data))
        return encoded_dataset
    def _generate(self) -> ReportResult:

        figures, tables = [], []

        PathBuilder.build(self.result_path)

        if ReferenceSequenceOverlap._check_encoder_class(
                self.state.optimal_hp_items[self.label].encoder):
            figure, data = self._compute_optimal_model_overlap()
            figures.append(figure)
            tables.append(data)

        for assessment_state in self.state.assessment_states:
            encoder = assessment_state.label_states[
                self.label].optimal_assessment_item.encoder
            if ReferenceSequenceOverlap._check_encoder_class(encoder):
                figure_filename = self.result_path / f"assessment_split_{assessment_state.split_index + 1}_model_vs_reference_overlap_{self.label}.pdf"
                df_filename = self.result_path / f"assessment_split_{assessment_state.split_index + 1}_overlap_sequences_{self.label}"
                figure, data = self._compute_model_overlap(
                    figure_filename, df_filename, encoder,
                    f"overlap sequences between the model for assessment split "
                    f"{assessment_state.split_index + 1} and reference list")
                figures.append(figure)
                tables.append(data)

        return ReportResult(self.name,
                            output_figures=figures,
                            output_tables=tables)
Example #20
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        data_long_format = DataReshaper.reshape(self.dataset)
        table_result = self._write_results_table(data_long_format)
        report_output_fig = self._safe_plot(data_long_format=data_long_format)
        output_figures = None if report_output_fig is None else [report_output_fig]
        return ReportResult(self.name, output_figures, [table_result])
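
DataReshaper.reshape produces a long-format table; a minimal sketch of the wide-to-long step with pandas.melt, where the column names are illustrative assumptions:

import pandas as pd

wide = pd.DataFrame({"example_id": ["r1", "r2"], "AAA": [0.2, 0.0], "CCC": [0.1, 0.4]})
long_format = wide.melt(id_vars="example_id", var_name="feature", value_name="value")
print(long_format)
#   example_id feature  value
# 0         r1     AAA    0.2
# 1         r2     AAA    0.0
# 2         r1     CCC    0.1
# 3         r2     CCC    0.4
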
Example #21
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        paths = []

        # make predictions
        predictions = self.method.predict(
            self.test_dataset.encoded_data,
            self.label)[self.label]  # label = disease

        true_labels = self.test_dataset.get_metadata(self.metadata_labels +
                                                     [self.label])
        metrics = ["FP", "FN"]

        plot = make_subplots(rows=len(self.metadata_labels), cols=2)
        csv_data_frames = []

        for label_index, meta_label in enumerate(self.metadata_labels):
            csv_data = {}
            for metric_index, metric in enumerate(metrics):
                plotting_data = self._metrics(metric=metric,
                                              label=self.label,
                                              meta_label=meta_label,
                                              predictions=predictions,
                                              true_labels=true_labels)

                csv_data[f"{metric}"] = plotting_data[f"{metric}"]

                plot.add_trace(go.Bar(x=plotting_data[meta_label],
                                      y=plotting_data[metric]),
                               row=label_index + 1,
                               col=metric_index + 1)
                plot.update_xaxes(title_text=f"{meta_label}",
                                  row=label_index + 1,
                                  col=metric_index + 1,
                                  type='category')
                plot.update_yaxes(title_text=f"{metric}",
                                  row=label_index + 1,
                                  col=metric_index + 1,
                                  rangemode="nonnegative",
                                  tick0=0,
                                  dtick=1)

            csv_data[f"{meta_label}"] = plotting_data[f"{meta_label}"]

            csv_data = pd.DataFrame(csv_data)

            csv_data_frames.append(csv_data)

        plot.update_traces(marker_color=px.colors.sequential.Teal[3],
                           showlegend=False)
        filename = self.result_path / "plots.html"
        plot.write_html(str(filename))
        report_output_fig = ReportOutput(filename)
        paths.append(report_output_fig)

        result_table_path = self._write_results_table(csv_data_frames,
                                                      self.metadata_labels)
        return ReportResult(name=self.name,
                            output_figures=paths,
                            output_tables=[ReportOutput(result_table_path[0])])
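
A hedged sketch of the per-group false-positive/false-negative counting that self._metrics presumably performs; the column names and the pandas approach are assumptions:

import pandas as pd

df = pd.DataFrame({
    "meta": ["A", "A", "B", "B"],   # a metadata label such as age group or batch
    "true": [0, 1, 0, 1],
    "pred": [1, 1, 1, 0],
})

fp = df[(df["pred"] == 1) & (df["true"] == 0)].groupby("meta").size()
fn = df[(df["pred"] == 0) & (df["true"] == 1)].groupby("meta").size()
print(fp.to_dict(), fn.to_dict())  # {'A': 1, 'B': 1} {'B': 1}
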
Example #22
    def test_run(self):
        path = EnvironmentSettings.tmp_test_path / "galaxy_api_dataset_generation/"
        PathBuilder.build(path)
        yaml_path = path / "specs.yaml"
        result_path = path / "results/"

        self.prepare_specs(yaml_path)

        run_immuneML(
            Namespace(
                **{
                    "specification_path": yaml_path,
                    "result_path": result_path,
                    'tool': "DatasetGenerationTool"
                }))

        self.assertTrue(
            os.path.isfile(result_path / "result/dataset_metadata.csv"))
        self.assertTrue(
            os.path.isfile(result_path / "result/dataset.iml_dataset"))
        self.assertEqual(
            200,
            len([
                name
                for name in os.listdir(result_path / "result/repertoires/")
                if os.path.isfile(
                    os.path.join(result_path / "result/repertoires/", name))
            ]))

        shutil.rmtree(path)
Example #23
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/featuredistribution/"
        PathBuilder.build(path)

        dataset = self._create_dummy_encoded_data(path)

        report = FeatureComparison.build_object(**{"dataset": dataset,
                                                   "result_path": path,
                                                   "comparison_label": "patient"})

        self.assertTrue(report.check_prerequisites())

        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)

        self.assertEqual(result.output_figures[0].path, path / "feature_comparison.html")
        self.assertEqual(result.output_tables[0].path, path / "feature_values.csv")

        content = pd.read_csv(path / "feature_values.csv")
        self.assertListEqual(list(content.columns),
                             ["patient", "example_id", "sequence", "feature", "value"])

        # report should succeed to build but check_prerequisites should be false when data is not encoded
        report = FeatureDistribution.build_object(**{"dataset": RepertoireDataset(),
                                                     "result_path": path})

        self.assertFalse(report.check_prerequisites())

        shutil.rmtree(path)
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        SubjectRepertoireCollector.check_dataset_type(dataset, [RepertoireDataset], "SubjectRepertoireCollector")

        rep_map = {}
        repertoires = []
        indices_to_keep = []

        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        for index, repertoire in enumerate(processed_dataset.get_data()):
            if repertoire.metadata["subject_id"] in rep_map:
                sequences = np.append(repertoire.sequences, rep_map[repertoire.metadata["subject_id"]].sequences)
                del rep_map[repertoire.metadata["subject_id"]]
                repertoires.append(SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], repertoire, sequences))
            else:
                rep_map[repertoire.metadata["subject_id"]] = repertoire
                indices_to_keep.append(index)

        for repertoire in rep_map.values():
            repertoires.append(SubjectRepertoireCollector.store_repertoire(params["result_path"], repertoire, repertoire.sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep, params["result_path"])

        return processed_dataset
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3,
                                            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config, filename="tmp_enc_dataset.pickle", pool_size=4))
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer",
                                         "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1')

        PathBuilder.build(path / 'result/instr1/')
        shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        df = pd.read_csv(predictions_path)
        self.assertEqual(50, df.shape[0])

        shutil.rmtree(path)
    def store(self, path: Path, feature_names=None, details_path=None):
        content = self._convert_object_to_dict()
        PathBuilder.build(path)
        file_path = path / FilenameHandler.get_filename(
            self.__class__.__name__, "pickle")

        with file_path.open("wb") as file:
            pickle.dump(content, file)

        if details_path is None:
            params_path = path / FilenameHandler.get_filename(
                self.__class__.__name__, "yaml")
        else:
            params_path = details_path

        with params_path.open("w") as file:
            # self.label is dereferenced above, so it is always set at this point
            desc = {
                self.label.name: {
                    **content, "feature_names": feature_names,
                    "classes": list(self.class_mapping.values())
                },
                "label": vars(self.label)
            }
            yaml.dump(desc, file)
    def test_repertoire_dataset(self):
        path = EnvironmentSettings.root_path / "test/tmp/cytoscape_export/"
        PathBuilder.build(path)

        repertoire_dataset = self._create_dummy_data(path / "data",
                                                     dataset_type="repertoire")

        cne = CytoscapeNetworkExporter(repertoire_dataset,
                                       path,
                                       chains=("alpha", "beta"),
                                       drop_duplicates=True,
                                       additional_node_attributes=["custom_1"],
                                       additional_edge_attributes=["custom_2"])

        result = cne._generate()

        self.assertIsInstance(result, ReportResult)
        self.assertTrue(os.path.isfile(result.output_tables[0].path))
        self.assertTrue(os.path.isfile(result.output_tables[1].path))
        self.assertTrue(os.path.isfile(result.output_tables[2].path))
        self.assertTrue(os.path.isfile(result.output_tables[3].path))

        with open(path / "repertoire_dataset/all_chains.sif") as file:
            self.assertListEqual(file.readlines(), [
                '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=AILUDGYF*v=V1-1*j=J1-1\n',
                '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=DFJKHJ*v=V1-1*j=J1-1\n',
                '*tra*s=DIUYUAG*v=V1-1*j=J1-1\tpair\t*trb*s=CTGTCGH*v=V1-1*j=J1-1\n'
            ])

        with open(path / "repertoire_dataset/shared_chains.sif") as file:
            self.assertListEqual(file.readlines(), [
                '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=AILUDGYF*v=V1-1*j=J1-1\n',
                '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=DFJKHJ*v=V1-1*j=J1-1\n'
            ])

        with open(path / "repertoire_dataset/node_metadata.tsv") as file:
            self.assertEqual(
                file.readline(),
                'shared_name\tchain\tsequence\tv_subgroup\tv_gene\tj_subgroup\tj_gene\tcustom_1\tn_duplicates\n'
            )

            self.assertListEqual(
                sorted(file.readlines()),
                sorted([
                    '*tra*s=DUPDUP*v=V1-1*j=J1-1\talpha\tDUPDUP\tTRAV1\tTRAV1-1\tTRAJ1\tTRAJ1-1\tCUST-0\t2\n',
                    '*trb*s=AILUDGYF*v=V1-1*j=J1-1\tbeta\tAILUDGYF\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-1\t1\n',
                    '*trb*s=DFJKHJ*v=V1-1*j=J1-1\tbeta\tDFJKHJ\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-2\t1\n',
                    '*tra*s=DIUYUAG*v=V1-1*j=J1-1\talpha\tDIUYUAG\tTRAV1\tTRAV1-1\tTRAJ1\tTRAJ1-1\tCUST-3\t1\n',
                    '*trb*s=CTGTCGH*v=V1-1*j=J1-1\tbeta\tCTGTCGH\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-4\t1\n'
                ]))

        with open(path / "repertoire_dataset/edge_metadata.tsv") as file:
            self.assertListEqual(file.readlines(), [
                'shared_name\tcustom_2\n',
                '*tra*s=DUPDUP*v=V1-1*j=J1-1 (pair) *trb*s=AILUDGYF*v=V1-1*j=J1-1\tCUST-A\n',
                '*tra*s=DUPDUP*v=V1-1*j=J1-1 (pair) *trb*s=DFJKHJ*v=V1-1*j=J1-1\tCUST-A\n',
                '*tra*s=DIUYUAG*v=V1-1*j=J1-1 (pair) *trb*s=CTGTCGH*v=V1-1*j=J1-1\tCUST-B\n'
            ])

        shutil.rmtree(path)
Example #28
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/motifseedrecovery/"
        PathBuilder.build(path)

        report = self._create_report(path)

        # Running the report
        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)
        self.assertEqual(result.output_tables[0].path,
                         path / "motif_seed_recovery.csv")
        self.assertEqual(result.output_figures[0].path,
                         path / "motif_seed_recovery.html")

        # Actual tests
        self.assertTrue(os.path.isfile(path / "motif_seed_recovery.csv"))
        self.assertTrue(os.path.isfile(path / "motif_seed_recovery.html"))

        written_data = pd.read_csv(path / "motif_seed_recovery.csv")

        self.assertListEqual(list(written_data.columns),
                             ["features", "max_seed_overlap", "coefficients"])
        self.assertListEqual(list(written_data["coefficients"]),
                             [i for i in range(5)])
        self.assertListEqual(list(written_data["features"]),
                             ["AAA", "AAC", "CKJ", "KSA", "AKJ"])
        self.assertListEqual(list(written_data["max_seed_overlap"]),
                             [3, 2, 0, 1, 1])

        shutil.rmtree(path)
    def test_create_model(self):
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset,
                                           k=2,
                                           vector_size=16,
                                           batch_size=1,
                                           model_path=test_path /
                                           "model.model")

        self.assertTrue(isinstance(model, Word2Vec))
        self.assertTrue("CA" in model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))

        shutil.rmtree(test_path)
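
A minimal sketch of training a k-mer Word2Vec model with gensim directly; this assumes gensim >= 4, where the size parameter is called vector_size and the vocabulary is exposed as wv.key_to_index (older gensim, as in the test above, used wv.vocab):

from gensim.models import Word2Vec

def kmers(sequence: str, k: int = 2) -> list:
    # overlapping k-mers, e.g. "CASS" -> ["CA", "AS", "SS"]
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

sentences = [kmers("CASSVFA"), kmers("CASSCCC")]
model = Word2Vec(sentences=sentences, vector_size=16, window=2, min_count=1)

assert "CA" in model.wv.key_to_index
print(model.wv["CA"].shape)  # (16,)
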
Example #30
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/logregcoefsreport/"
        PathBuilder.build(path)

        report = self._create_report(path)

        # Running the report
        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)
        self.assertEqual(result.output_tables[0].path, path / "coefficients.csv")
        self.assertEqual(result.output_figures[0].path, path / "all_coefficients.html")
        self.assertEqual(result.output_figures[1].path, path / "nonzero_coefficients.html")
        self.assertEqual(result.output_figures[2].path, path / "cutoff_10_coefficients.html")
        self.assertEqual(result.output_figures[3].path, path / "largest_5_coefficients.html")

        # Actual tests
        self.assertTrue(os.path.isfile(path / "coefficients.csv"))
        self.assertTrue(os.path.isfile(path / "all_coefficients.html"))
        self.assertTrue(os.path.isfile(path / "nonzero_coefficients.html"))
        self.assertTrue(os.path.isfile(path / "cutoff_10_coefficients.html"))
        self.assertTrue(os.path.isfile(path / "largest_5_coefficients.html"))

        written_data = pd.read_csv(path / "coefficients.csv")

        self.assertListEqual(list(written_data.columns), ["features", "coefficients"])
        self.assertListEqual(list(written_data["coefficients"]), list(reversed([i for i in range(20)])))
        self.assertListEqual(list(written_data["features"]), list(reversed([f"feature{i}" for i in range(20)])))

        shutil.rmtree(path)