Example #1
    def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):

        self.class_mapping = Util.make_class_mapping(encoded_data.labels[label_name])
        self.feature_names = encoded_data.feature_names
        self.label_name = label_name

        mapped_y = Util.map_to_new_class_values(encoded_data.labels[label_name], self.class_mapping)

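        # memoize the fitted model: if a model was already fitted with identical data and parameters, it is loaded from cache instead of refit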
        self.model = CacheHandler.memo_by_params(self._prepare_caching_params(encoded_data, encoded_data.labels[label_name], self.FIT, label_name),
                                                 lambda: self._fit(encoded_data.examples, mapped_y, cores_for_training))
Example #2
    def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):

        self.feature_names = encoded_data.feature_names

        Util.setup_pytorch(self.number_of_threads, self.random_seed)
        if "chain_names" in encoded_data.info and encoded_data.info["chain_names"] is not None and len(encoded_data.info["chain_names"]) == 2:
            self.chain_names = encoded_data.info["chain_names"]
        else:
            self.chain_names = ["chain_1", "chain_2"]

        self._make_CNN()
        self.CNN.to(device=self.device)

        self.class_mapping = Util.make_binary_class_mapping(encoded_data.labels[label_name])
        self.label_name = label_name

        self.CNN.train()

        iteration = 0
        loss_function = nn.BCEWithLogitsLoss().to(device=self.device)
        optimizer = torch.optim.Adam(self.CNN.parameters(), lr=self.learning_rate, weight_decay=self.l2_weight_decay, eps=1e-4)
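        # training state: keeps a snapshot of the model with the best validation loss so it can be restored at the end of training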
        state = dict(model=copy.deepcopy(self.CNN).state_dict(), optimizer=optimizer, iteration=iteration, best_validation_loss=np.inf)
        train_data, validation_data = self._prepare_and_split_data(encoded_data)

        logging.info("ReceptorCNN: starting training.")
        while iteration < self.iteration_count:
            for examples, labels, example_ids in self._get_data_batch(train_data, self.label_name):

                # Reset gradients
                optimizer.zero_grad()

                # Calculate predictions
                logit_outputs = self.CNN(examples)

                # Calculate losses
                loss = self._compute_loss(loss_function, logit_outputs, labels)

                # Perform update
                loss.backward()
                optimizer.step()

                self.CNN.rescale_weights_for_IGM()

                iteration += 1

                # Calculate scores and loss on training set and validation set
                if iteration % self.evaluate_at == 0 or iteration == self.iteration_count or iteration == 1:
                    logging.info(f"ReceptorCNN: training - iteration {iteration}.")
                    state = self._evaluate_state(state, iteration, loss_function, validation_data)

                if iteration >= self.iteration_count:
                    self.CNN.load_state_dict(state["model"])
                    break

        logging.info("ReceptorCNN: finished training.")
Example #3
    def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label_name: str = None, cores_for_training: int = -1,
                                optimization_metric='balanced_accuracy'):

        self.class_mapping = Util.make_class_mapping(encoded_data.labels[label_name])
        self.feature_names = encoded_data.feature_names
        self.label_name = label_name
        mapped_y = Util.map_to_new_class_values(encoded_data.labels[label_name], self.class_mapping)

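        # memoize the cross-validation fit so repeated runs with identical data and parameters reuse the cached model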
        self.model = CacheHandler.memo_by_params(
            self._prepare_caching_params(encoded_data, mapped_y, self.FIT_CV, label_name, number_of_splits),
            lambda: self._fit_by_cross_validation(encoded_data.examples, mapped_y, number_of_splits, label_name, cores_for_training,
                                                  optimization_metric))
Example #4
    def fit(self,
            encoded_data: EncodedData,
            label: Label,
            cores_for_training: int = 2):

        self.label = label
        self.class_mapping = Util.make_class_mapping(
            encoded_data.labels[self.label.name])
        self.feature_names = encoded_data.feature_names

        mapped_y = Util.map_to_new_class_values(
            encoded_data.labels[self.label.name], self.class_mapping)

        self.model = self._fit(encoded_data.examples, mapped_y,
                               cores_for_training)
Example #5
    def _score_for_metric(metric: Metric, predicted_y, predicted_proba_y,
                          true_y, labels):
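        # resolve the metric function by name: prefer an implementation in ml_metrics, otherwise fall back to the metrics module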
        if hasattr(ml_metrics, metric.value):
            fn = getattr(ml_metrics, metric.value)
        else:
            fn = getattr(metrics, metric.value)

        true_y, predicted_y = Util.binarize_labels(true_y=true_y,
                                                   predicted_y=predicted_y,
                                                   labels=labels)

        try:
            if metric in Metric.get_probability_based_metric_types():
                predictions = predicted_proba_y
                if predicted_proba_y is None:
                    warnings.warn(
                        f"MLMethodAssessment: metric {metric} is specified, but the chosen ML method does not output "
                        f"class probabilities. Using predicted classes instead..."
                    )
                    predictions = predicted_y
            else:
                predictions = predicted_y

            score = fn(true_y, predictions)

        except ValueError as err:
            warnings.warn(
                f"MLMethodAssessment: score for metric {metric.name} could not be calculated."
                f"\nPredicted values: {predicted_y}\nTrue values: {true_y}.\nMore details: {err}",
                RuntimeWarning)
            score = "not computed"

        return score
Example #6
    def fit(self,
            encoded_data: EncodedData,
            label: Label,
            cores_for_training: int = 2):
        self.feature_names = encoded_data.feature_names
        X = encoded_data.examples
        assert X.shape[1] == 2, "ProbabilisticBinaryClassifier: the shape of the input is not compatible with the classifier. " \
                                "The classifier is defined when examples are encoded by two counts: the number of successful trials " \
                                "and the total number of trials. If this is not targeted use-case and the encoding, please consider using " \
                                "another classifier."

        self.class_mapping = Util.make_binary_class_mapping(
            encoded_data.labels[label.name])
        self.label = label
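        # count examples per class, then estimate beta distribution parameters from the per-class success/total counts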
        self.N_0 = int(np.sum(np.array(encoded_data.labels[label.name]) == self.class_mapping[0]))
        self.N_1 = int(np.sum(np.array(encoded_data.labels[label.name]) == self.class_mapping[1]))
        self.alpha_0, self.beta_0 = self._find_beta_distribution_parameters(
            X[np.nonzero(np.array(encoded_data.labels[self.label.name]) == self.class_mapping[0])], self.N_0)
        self.alpha_1, self.beta_1 = self._find_beta_distribution_parameters(
            X[np.nonzero(np.array(encoded_data.labels[self.label.name]) == self.class_mapping[1])], self.N_1)
Example #7
    def make_html_map(state: DatasetExportState, base_path: Path) -> dict:
        html_map = {
            "css_style": Util.get_css_content(DatasetExportHTMLBuilder.CSS_PATH),
            "name": state.name,
            'immuneML_version': MLUtil.get_immuneML_version(),
            "full_specs": Util.get_full_specs_path(base_path),
            "datasets": [
                {
                    "dataset_name": dataset.name,
                    "dataset_type": StringHelper.camel_case_to_word_string(type(dataset).__name__),
                    "dataset_size": f"{dataset.get_example_count()} {type(dataset).__name__.replace('Dataset', 's').lower()}",
                    "labels": [{"label_name": label} for label in dataset.get_label_names()],
                    "preprocessing_sequence": [
                        {
                            "preprocessing_name": preprocessing.__class__.__name__,
                            "preprocessing_params": ", ".join([f"{key}: {value}" for key, value in vars(preprocessing).items()])
                        } for preprocessing in state.preprocessing_sequence
                    ] if state.preprocessing_sequence is not None else [],
                    "show_preprocessing": state.preprocessing_sequence is not None and len(state.preprocessing_sequence) > 0,
                    "formats": [
                        {
                            "format_name": format_name,
                            "dataset_download_link": os.path.relpath(path=Util.make_downloadable_zip(state.result_path, state.paths[dataset.name][format_name]),
                                                                     start=base_path)
                        } for format_name in state.formats
                    ]
                } for dataset in state.datasets
            ]
        }

        return html_map
Example #8
    def make_html_map(state: MLApplicationState, base_path: Path) -> dict:
        return {
            "css_style": Util.get_css_content(MLApplicationHTMLBuilder.CSS_PATH),
            "hp_setting": state.hp_setting.get_key(),
            'immuneML_version': MLUtil.get_immuneML_version(),
            "label": state.label_config.get_labels_by_name()[0],
            "dataset_name": state.dataset.name,
            "dataset_type": StringHelper.camel_case_to_word_string(type(state.dataset).__name__),
            "example_count": state.dataset.get_example_count(),
            "dataset_size": f"{state.dataset.get_example_count()} {type(state.dataset).__name__.replace('Dataset', 's').lower()}",
            "labels": [{"name": label_name, "values": str(state.label_config.get_label_values(label_name))[1:-1]}
                       for label_name in state.label_config.get_labels_by_name()],
            "predictions": Util.get_table_string_from_csv(state.predictions_path),
            "predictions_download_link": os.path.relpath(state.predictions_path, base_path)
        }
Example #9
    def make_html_map(state: DatasetExportState, base_path: Path) -> dict:
        html_map = {
            "css_style": Util.get_css_content(DatasetExportHTMLBuilder.CSS_PATH),
            "name": state.name,
            'immuneML_version': MLUtil.get_immuneML_version(),
            "full_specs": Util.get_full_specs_path(base_path),
            "datasets": [
                {
                    "dataset_name": dataset.name,
                    "dataset_type": StringHelper.camel_case_to_word_string(type(dataset).__name__),
                    "dataset_size": f"{dataset.get_example_count()} {type(dataset).__name__.replace('Dataset', 's').lower()}",
                    "labels": [{"label_name": label} for label in dataset.get_label_names()],
                    "formats": [
                        {
                            "format_name": format_name,
                            "dataset_download_link": os.path.relpath(path=Util.make_downloadable_zip(state.result_path, state.paths[dataset.name][format_name]),
                                                                     start=base_path)
                        } for format_name in state.formats
                    ]
                } for dataset in state.datasets
            ]
        }

        return html_map
Example #10
    def predict(self, encoded_data: EncodedData, label_name: str):
        self.check_is_fitted(label_name)
        predictions = self.model.predict(encoded_data.examples)
        return {label_name: Util.map_to_old_class_values(np.array(predictions), self.class_mapping)}
Example #11
    def make_html_map(state: ExploratoryAnalysisState, base_path: Path) -> dict:
        html_map = {
            "css_style": Util.get_css_content(ExploratoryAnalysisHTMLBuilder.CSS_PATH),
            "full_specs": Util.get_full_specs_path(base_path),
            'immuneML_version': MLUtil.get_immuneML_version(),
            "analyses": [
                {
                    "name": name,
                    "dataset_name": analysis.dataset.name if analysis.dataset.name is not None else analysis.dataset.identifier,
                    "dataset_type": StringHelper.camel_case_to_word_string(type(analysis.dataset).__name__),
                    "example_count": analysis.dataset.get_example_count(),
                    "dataset_size": f"{analysis.dataset.get_example_count()} {type(analysis.dataset).__name__.replace('Dataset', 's').lower()}",
                    "show_labels": analysis.label_config is not None and len(analysis.label_config.get_labels_by_name()) > 0,
                    "labels": [{"name": label.name, "values": str(label.values)[1:-1]}
                               for label in analysis.label_config.get_label_objects()] if analysis.label_config else None,
                    "encoding_key": analysis.encoder.name if analysis.encoder is not None else None,
                    "encoding_name": StringHelper.camel_case_to_word_string(type(analysis.encoder).__name__) if analysis.encoder is not None else None,
                    "encoding_params": [{"param_name": key, "param_value": value} for key, value in vars(analysis.encoder).items()]
                    if analysis.encoder is not None else None,
                    "show_encoding": analysis.encoder is not None,
                    "report": Util.to_dict_recursive(analysis.report_result, base_path)
                } for name, analysis in state.exploratory_analysis_units.items()
            ]
        }

        for analysis in html_map["analyses"]:
            analysis["show_tables"] = len(analysis["report"]["output_tables"]) > 0 if "output_tables" in analysis["report"] else False
            analysis["show_text"] = len(analysis["report"]["output_text"]) > 0 if "output_text" in analysis["report"] else False

        return html_map
Example #12
    def fit_by_cross_validation(self,
                                encoded_data: EncodedData,
                                number_of_splits: int = 5,
                                label: Label = None,
                                cores_for_training: int = -1,
                                optimization_metric='balanced_accuracy'):

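        # remember the label and the class mapping so that predictions can later be mapped back to the original class values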
        self.class_mapping = Util.make_class_mapping(
            encoded_data.labels[label.name])
        self.feature_names = encoded_data.feature_names
        self.label = label
        mapped_y = Util.map_to_new_class_values(
            encoded_data.labels[self.label.name], self.class_mapping)

        self.model = self._fit_by_cross_validation(encoded_data.examples,
                                                   mapped_y, number_of_splits,
                                                   label, cores_for_training,
                                                   optimization_metric)
Example #13
    def _make_html_map(report_results: dict, result_path: Path, instruction_result_paths: dict) -> dict:
        html_map = {
            "css_style": Util.get_css_content(MultiDatasetBenchmarkHTMLBuilder.CSS_PATH),
            "reports": Util.to_dict_recursive(report_results.values(), result_path),
            'immuneML_version': MLUtil.get_immuneML_version(),
            "show_reports": True,
            "instruction_overviews": [{"name": name, "path": Path(os.path.relpath(path / "index.html", result_path))}
                                      for name, path in instruction_result_paths.items()]
        }

        if len(html_map['reports']) == 0:
            html_map['show_reports'] = False

        return html_map
Example #14
    def _make_roc_curve(self, hp_item: HPItem, label_name: str,
                        proba_name: str) -> dict:
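        # read the test-set predictions for this HP setting and compute ROC curve points from the true classes and predicted probabilities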
        df = pd.read_csv(hp_item.test_predictions_path)

        true_y = df[f"{label_name}_true_class"].values
        predicted_y = df[proba_name].values

        true_y = Util.map_to_new_class_values(
            true_y, hp_item.method.get_class_mapping())
        fpr, tpr, _ = roc_curve(y_true=true_y, y_score=predicted_y)

        return {
            "FPR": fpr,
            "TPR": tpr,
            "AUC": roc_auc_score(true_y=true_y, predicted_y=predicted_y),
            "HPItem": str(hp_item.hp_setting)
        }
Example #15
    def _make_main_html_map(state: TrainMLModelState, base_path: Path) -> dict:
        html_map = {
            "css_style": Util.get_css_content(HPHTMLBuilder.CSS_PATH),
            "full_specs": Util.get_full_specs_path(base_path),
            "dataset_name": state.dataset.name if state.dataset.name is not None else state.dataset.identifier,
            "dataset_type": StringHelper.camel_case_to_word_string(type(state.dataset).__name__),
            "example_count": state.dataset.get_example_count(),
            "dataset_size": f"{state.dataset.get_example_count()} {type(state.dataset).__name__.replace('Dataset', 's').lower()}",
            "labels": [{"name": label.name, "values": str(label.values)[1:-1]} for label in state.label_configuration.get_label_objects()],
            "optimization_metric": state.optimization_metric.name.lower(),
            "other_metrics": str([metric.name.lower() for metric in state.metrics])[1:-1].replace("'", ""),
            "metrics": [{"name": metric.name.lower()} for metric in state.metrics],
            "assessment_desc": state.assessment,
            "selection_desc": state.selection,
            "show_hp_reports": bool(state.report_results),
            'hp_reports': Util.to_dict_recursive(state.report_results, base_path) if state.report_results else None,
            "hp_per_label": HPHTMLBuilder._make_hp_per_label(state),
            'models_per_label': HPHTMLBuilder._make_model_per_label(state, base_path),
            'immuneML_version': MLUtil.get_immuneML_version()
        }

        return html_map
Example #16
    def make_html_map(state: SimulationState, base_path: Path) -> dict:

        html_map = {
            "css_style": Util.get_css_content(SimulationHTMLBuilder.CSS_PATH),
            "name": state.name,
            'immuneML_version': MLUtil.get_immuneML_version(),
            "full_specs": Util.get_full_specs_path(base_path),
            "dataset_name": state.resulting_dataset.name if state.resulting_dataset.name is not None else state.resulting_dataset.identifier,
            "dataset_type": StringHelper.camel_case_to_word_string(type(state.resulting_dataset).__name__),
            "example_count": state.resulting_dataset.get_example_count(),
            "dataset_size": f"{state.resulting_dataset.get_example_count()} {type(state.resulting_dataset).__name__.replace('Dataset', 's').lower()}",
            "labels": [{"label_name": label} for label in state.resulting_dataset.get_label_names()],
            "formats": [
                {
                    "format_name": format_name,
                    "dataset_download_link": os.path.relpath(path=Util.make_downloadable_zip(state.result_path, state.paths[state.resulting_dataset.name][format_name]),
                                                             start=base_path)
                } for format_name in state.formats
            ],
            "implantings": [Util.to_dict_recursive(implanting, base_path) for implanting in state.simulation.implantings]
        }

        return html_map
Example #17
    def make_html_map(state: SubsamplingState, base_path: Path) -> dict:
        html_map = {
            "css_style": Util.get_css_content(SubsamplingHTMLBuilder.CSS_PATH),
            "name": state.name,
            'immuneML_version': MLUtil.get_immuneML_version(),
            "full_specs": Util.get_full_specs_path(base_path),
            "dataset_name": state.dataset.name if state.dataset.name is not None else state.dataset.identifier,
            "labels": [{"label_name": label} for label in state.dataset.get_label_names()],
            "dataset_type": StringHelper.camel_case_to_word_string(type(state.dataset).__name__),
            "example_count": state.dataset.get_example_count(),
            "subsampled_datasets": [
                {
                    "sub_dataset_iter": i,
                    "sub_dataset_name": dataset.name,
                    "dataset_size": f"{dataset.get_example_count()} {type(dataset).__name__.replace('Dataset', 's').lower()}",
                    "formats": [{"dataset_download_link": item, "format_name": key}
                                for key, item in state.subsampled_dataset_paths[dataset.name].items()]
                } for i, dataset in enumerate(state.subsampled_datasets, 1)
            ]
        }

        return html_map
Example #18
    def fit(self,
            encoded_data: EncodedData,
            label: Label,
            cores_for_training: int = 2):
        self.feature_names = encoded_data.feature_names

        Util.setup_pytorch(self.number_of_threads, self.random_seed)
        self.input_size = encoded_data.examples.shape[1]

        self._make_log_reg()

        self.label = label
        self.class_mapping = Util.make_binary_class_mapping(
            encoded_data.labels[self.label.name])

        loss = np.inf

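        # training state: tracks the lowest loss seen so far and a copy of the corresponding model weights for early stopping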
        state = {"loss": loss, "model": None}
        loss_func = torch.nn.BCEWithLogitsLoss(reduction='mean')
        optimizer = torch.optim.SGD(self.logistic_regression.parameters(),
                                    lr=self.learning_rate)

        for iteration in range(self.iteration_count):

            # reset gradients
            optimizer.zero_grad()

            # compute predictions only for k-mers with max score
            max_logit_indices = self._get_max_logits_indices(
                encoded_data.examples)
            example_count = encoded_data.examples.shape[0]
            examples = torch.from_numpy(encoded_data.examples).float()[
                torch.arange(example_count).long(), :, max_logit_indices]
            logits = self.logistic_regression(examples)

            # compute the loss
            loss = loss_func(
                logits,
                torch.tensor(encoded_data.labels[self.label.name]).float())

            # perform update
            loss.backward()
            optimizer.step()

            # log the current loss and keep the best model so far when early stopping is enabled
            if iteration % self.evaluate_at == 0 or iteration == self.iteration_count - 1:
                logging.info(
                    f"AtchleyKmerMILClassifier: log loss at iteration {iteration+1}/{self.iteration_count}: {loss}."
                )
                if self.use_early_stopping and loss < state["loss"]:
                    state = {
                        "loss": loss.detach().numpy(),
                        "model": copy.deepcopy(self.logistic_regression.state_dict())
                    }

            if loss < self.threshold:
                break
        else:
            logging.warning(
                "AtchleyKmerMILClassifier: the logistic regression model did not converge."
            )

        # restore the best model seen during training if the final loss is worse than the stored one
        if self.use_early_stopping and state["model"] is not None and loss > state["loss"]:
            self.logistic_regression.load_state_dict(state["model"])
Example #19
    def get_package_info(self) -> str:
        return Util.get_immuneML_version()
Example #20
    def _make_document(presentations: List[InstructionPresentation], path: Path) -> Path:
        result_path = path / "index.html"
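        # with multiple instructions, render an index page that links to each of them; with a single instruction, its page becomes the index directly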
        if len(presentations) > 1:
            html_map = {"instructions": presentations, "css_path": EnvironmentSettings.html_templates_path / "css/custom.css",
                        "full_specs": Util.get_full_specs_path(path), 'immuneML_version': MLUtil.get_immuneML_version()}
            TemplateParser.parse(template_path=EnvironmentSettings.html_templates_path / "index.html",
                                 template_map=html_map, result_path=result_path)
        elif len(presentations) == 1:
            shutil.copyfile(str(presentations[0].path), str(result_path))
            HTMLBuilder._update_paths(result_path)
        else:
            result_path = None

        return result_path
Example #21
    def get_package_info(self) -> str:
        return 'immuneML ' + Util.get_immuneML_version() + '; deepRC ' + pkg_resources.get_distribution('DeepRC').version