def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
    """
    Fit the underlying scikit-learn model on the encoded examples for the given label.

    The fitted model is memoized through CacheHandler so repeated fits with identical
    data/parameters reuse the cached result.

    Args:
        encoded_data: EncodedData holding the example matrix, labels and feature names.
        label_name: name of the label (key into encoded_data.labels) to fit on.
        cores_for_training: number of CPU cores passed through to the fitting routine.
    """
    self.class_mapping = Util.make_class_mapping(encoded_data.labels[label_name])
    self.feature_names = encoded_data.feature_names
    self.label_name = label_name
    mapped_y = Util.map_to_new_class_values(encoded_data.labels[label_name], self.class_mapping)
    # consistency fix: key the cache on mapped_y (the exact y the fit closure consumes),
    # matching fit_by_cross_validation, instead of the raw label values
    self.model = CacheHandler.memo_by_params(
        self._prepare_caching_params(encoded_data, mapped_y, self.FIT, label_name),
        lambda: self._fit(encoded_data.examples, mapped_y, cores_for_training))
def fit(self, encoded_data: EncodedData, label_name: str, cores_for_training: int = 2):
    """
    Train the receptor CNN on the encoded dataset for a binary label.

    Sets up PyTorch threading/seeding, builds the CNN, then runs mini-batch training
    with Adam + BCE-with-logits loss, periodically evaluating on a held-out validation
    split and keeping the best-so-far model state, which is restored at the end.

    Note: cores_for_training is accepted for interface compatibility but not used here;
    threading is controlled via self.number_of_threads.
    """
    self.feature_names = encoded_data.feature_names
    Util.setup_pytorch(self.number_of_threads, self.random_seed)
    # use chain names from the encoding if exactly two are provided, otherwise fall back to generic names
    if "chain_names" in encoded_data.info and encoded_data.info["chain_names"] is not None and len(encoded_data.info["chain_names"]) == 2:
        self.chain_names = encoded_data.info["chain_names"]
    else:
        self.chain_names = ["chain_1", "chain_2"]
    self._make_CNN()
    self.CNN.to(device=self.device)
    self.class_mapping = Util.make_binary_class_mapping(encoded_data.labels[label_name])
    self.label_name = label_name
    self.CNN.train()
    iteration = 0
    loss_function = nn.BCEWithLogitsLoss().to(device=self.device)
    optimizer = torch.optim.Adam(self.CNN.parameters(), lr=self.learning_rate, weight_decay=self.l2_weight_decay, eps=1e-4)
    # training state tracked across evaluations: best model weights + best validation loss
    state = dict(model=copy.deepcopy(self.CNN).state_dict(), optimizer=optimizer, iteration=iteration, best_validation_loss=np.inf)
    train_data, validation_data = self._prepare_and_split_data(encoded_data)
    logging.info("ReceptorCNN: starting training.")
    # iteration counts individual batches, not epochs; the while loop re-iterates the
    # training data until iteration_count batches have been processed
    while iteration < self.iteration_count:
        for examples, labels, example_ids in self._get_data_batch(train_data, self.label_name):
            # Reset gradients
            optimizer.zero_grad()
            # Calculate predictions
            logit_outputs = self.CNN(examples)
            # Calculate losses
            loss = self._compute_loss(loss_function, logit_outputs, labels)
            # Perform update
            loss.backward()
            optimizer.step()
            self.CNN.rescale_weights_for_IGM()
            iteration += 1
            # Calculate scores and loss on training set and validation set
            if iteration % self.evaluate_at == 0 or iteration == self.iteration_count or iteration == 1:
                logging.info(f"ReceptorCNN: training - iteration {iteration}.")
                state = self._evaluate_state(state, iteration, loss_function, validation_data)
            if iteration >= self.iteration_count:
                # restore the best model weights seen during validation before stopping
                self.CNN.load_state_dict(state["model"])
                break
    logging.info("ReceptorCNN: finished training.")
def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label_name: str = None,
                            cores_for_training: int = -1, optimization_metric='balanced_accuracy'):
    """
    Fit the model with internal cross-validation over number_of_splits folds,
    optimizing the given metric; the result is memoized through CacheHandler.
    """
    raw_labels = encoded_data.labels[label_name]
    self.class_mapping = Util.make_class_mapping(raw_labels)
    self.feature_names = encoded_data.feature_names
    self.label_name = label_name
    mapped_y = Util.map_to_new_class_values(raw_labels, self.class_mapping)
    caching_params = self._prepare_caching_params(encoded_data, mapped_y, self.FIT_CV, label_name, number_of_splits)

    def _do_fit():
        return self._fit_by_cross_validation(encoded_data.examples, mapped_y, number_of_splits, label_name,
                                             cores_for_training, optimization_metric)

    self.model = CacheHandler.memo_by_params(caching_params, _do_fit)
def fit(self, encoded_data: EncodedData, label: Label, cores_for_training: int = 2):
    """
    Fit the model for the given label: remember label and feature names, build the
    class mapping from the raw label values, remap them, and delegate to _fit.
    """
    self.label = label
    self.feature_names = encoded_data.feature_names
    raw_y = encoded_data.labels[self.label.name]
    self.class_mapping = Util.make_class_mapping(raw_y)
    mapped_y = Util.map_to_new_class_values(raw_y, self.class_mapping)
    self.model = self._fit(encoded_data.examples, mapped_y, cores_for_training)
def _score_for_metric(metric: Metric, predicted_y, predicted_proba_y, true_y, labels):
    """
    Compute one metric score, preferring an immuneML metric implementation
    (ml_metrics) over sklearn.metrics. For probability-based metrics, class
    probabilities are used when available, otherwise hard predictions with a
    warning. Returns "not computed" (a string) if the metric raises ValueError.
    """
    source = ml_metrics if hasattr(ml_metrics, metric.value) else metrics
    fn = getattr(source, metric.value)
    true_y, predicted_y = Util.binarize_labels(true_y=true_y, predicted_y=predicted_y, labels=labels)

    if metric in Metric.get_probability_based_metric_types():
        if predicted_proba_y is None:
            warnings.warn(
                f"MLMethodAssessment: metric {metric} is specified, but the chosen ML method does not output "
                f"class probabilities. Using predicted classes instead..."
            )
            predictions = predicted_y
        else:
            predictions = predicted_proba_y
    else:
        predictions = predicted_y

    try:
        score = fn(true_y, predictions)
    except ValueError as err:
        warnings.warn(
            f"MLMethodAssessment: score for metric {metric.name} could not be calculated."
            f"\nPredicted values: {predicted_y}\nTrue values: {true_y}.\nMore details: {err}", RuntimeWarning)
        score = "not computed"
    return score
def fit(self, encoded_data: EncodedData, label: Label, cores_for_training: int = 2):
    """
    Fit the probabilistic binary classifier by estimating beta-distribution
    parameters per class.

    Expects each example to be encoded as two counts (shape (n, 2)): the number of
    successful trials and the total number of trials. Stores the per-class example
    counts (N_0, N_1) and the fitted beta parameters (alpha_i, beta_i) for each class.

    Note: cores_for_training is accepted for interface compatibility but unused here.
    """
    self.feature_names = encoded_data.feature_names
    X = encoded_data.examples
    # input validation: the model is only defined for the two-count encoding
    assert X.shape[1] == 2, "ProbabilisticBinaryClassifier: the shape of the input is not compatible with the classifier. " \
                            "The classifier is defined when examples are encoded by two counts: the number of successful trials " \
                            "and the total number of trials. If this is not targeted use-case and the encoding, please consider using " \
                            "another classifier."
    self.class_mapping = Util.make_binary_class_mapping(encoded_data.labels[label.name])
    self.label = label
    # number of examples per mapped class
    self.N_0 = int(np.sum(np.array(encoded_data.labels[label.name]) == self.class_mapping[0]))
    self.N_1 = int(np.sum(np.array(encoded_data.labels[label.name]) == self.class_mapping[1]))
    # estimate beta parameters separately on the example subset of each class
    self.alpha_0, self.beta_0 = self._find_beta_distribution_parameters(
        X[np.nonzero(np.array(encoded_data.labels[self.label.name]) == self.class_mapping[0])], self.N_0)
    self.alpha_1, self.beta_1 = self._find_beta_distribution_parameters(
        X[np.nonzero(np.array(encoded_data.labels[self.label.name]) == self.class_mapping[1])], self.N_1)
def make_html_map(state: DatasetExportState, base_path: Path) -> dict:
    """
    Build the template map for the dataset export HTML page: one entry per exported
    dataset with its labels, applied preprocessing steps and per-format download links
    (relative to base_path).
    """
    dataset_entries = []
    for dataset in state.datasets:
        preprocessing_entries = []
        if state.preprocessing_sequence is not None:
            for preprocessing in state.preprocessing_sequence:
                params_text = ", ".join([f"{key}: {value}" for key, value in vars(preprocessing).items()])
                preprocessing_entries.append({"preprocessing_name": preprocessing.__class__.__name__,
                                              "preprocessing_params": params_text})

        format_entries = []
        for format_name in state.formats:
            zip_path = Util.make_downloadable_zip(state.result_path, state.paths[dataset.name][format_name])
            format_entries.append({"format_name": format_name,
                                   "dataset_download_link": os.path.relpath(path=zip_path, start=base_path)})

        dataset_entries.append({
            "dataset_name": dataset.name,
            "dataset_type": StringHelper.camel_case_to_word_string(type(dataset).__name__),
            "dataset_size": f"{dataset.get_example_count()} {type(dataset).__name__.replace('Dataset', 's').lower()}",
            "labels": [{"label_name": label} for label in dataset.get_label_names()],
            "preprocessing_sequence": preprocessing_entries,
            "show_preprocessing": state.preprocessing_sequence is not None and len(state.preprocessing_sequence) > 0,
            "formats": format_entries
        })

    return {
        "css_style": Util.get_css_content(DatasetExportHTMLBuilder.CSS_PATH),
        "name": state.name,
        'immuneML_version': MLUtil.get_immuneML_version(),
        "full_specs": Util.get_full_specs_path(base_path),
        "datasets": dataset_entries
    }
def make_html_map(state: MLApplicationState, base_path: Path) -> dict:
    """
    Build the template map for the ML application HTML page: dataset summary,
    label descriptions, and the predictions table plus its download link
    (relative to base_path).
    """
    dataset = state.dataset
    dataset_type_name = type(dataset).__name__
    label_entries = [{"name": label_name,
                      "values": str(state.label_config.get_label_values(label_name))[1:-1]}
                     for label_name in state.label_config.get_labels_by_name()]
    return {
        "css_style": Util.get_css_content(MLApplicationHTMLBuilder.CSS_PATH),
        "hp_setting": state.hp_setting.get_key(),
        'immuneML_version': MLUtil.get_immuneML_version(),
        "label": state.label_config.get_labels_by_name()[0],
        "dataset_name": dataset.name,
        "dataset_type": StringHelper.camel_case_to_word_string(dataset_type_name),
        "example_count": dataset.get_example_count(),
        "dataset_size": f"{dataset.get_example_count()} {dataset_type_name.replace('Dataset', 's').lower()}",
        "labels": label_entries,
        "predictions": Util.get_table_string_from_csv(state.predictions_path),
        "predictions_download_link": os.path.relpath(state.predictions_path, base_path)
    }
def make_html_map(state: DatasetExportState, base_path: Path) -> dict:
    """
    Build the template map for the dataset export HTML page: one entry per dataset
    with its labels and per-format download links (relative to base_path).
    """
    dataset_entries = []
    for dataset in state.datasets:
        type_name = type(dataset).__name__
        format_entries = []
        for format_name in state.formats:
            zip_path = Util.make_downloadable_zip(state.result_path, state.paths[dataset.name][format_name])
            format_entries.append({"format_name": format_name,
                                   "dataset_download_link": os.path.relpath(path=zip_path, start=base_path)})
        dataset_entries.append({
            "dataset_name": dataset.name,
            "dataset_type": StringHelper.camel_case_to_word_string(type_name),
            "dataset_size": f"{dataset.get_example_count()} {type_name.replace('Dataset', 's').lower()}",
            "labels": [{"label_name": label} for label in dataset.get_label_names()],
            "formats": format_entries
        })

    return {
        "css_style": Util.get_css_content(DatasetExportHTMLBuilder.CSS_PATH),
        "name": state.name,
        'immuneML_version': MLUtil.get_immuneML_version(),
        "full_specs": Util.get_full_specs_path(base_path),
        "datasets": dataset_entries
    }
def predict(self, encoded_data: EncodedData, label_name: str):
    """
    Predict classes for the encoded examples and map them back from the internal
    class values to the original label values. Returns {label_name: predictions}.
    """
    self.check_is_fitted(label_name)
    raw_predictions = np.array(self.model.predict(encoded_data.examples))
    mapped_predictions = Util.map_to_old_class_values(raw_predictions, self.class_mapping)
    return {label_name: mapped_predictions}
def make_html_map(state: ExploratoryAnalysisState, base_path: Path) -> dict:
    """
    Build the template map for the exploratory analysis HTML page: one entry per
    analysis unit with its dataset summary, labels, encoding details and report
    (converted to a nested dict with paths relative to base_path).
    """
    html_map = {
        "css_style": Util.get_css_content(ExploratoryAnalysisHTMLBuilder.CSS_PATH),
        "full_specs": Util.get_full_specs_path(base_path),
        'immuneML_version': MLUtil.get_immuneML_version(),
        "analyses": [{
            "name": name,
            # fall back to the dataset identifier when no name was assigned
            "dataset_name": analysis.dataset.name if analysis.dataset.name is not None else analysis.dataset.identifier,
            "dataset_type": StringHelper.camel_case_to_word_string(type(analysis.dataset).__name__),
            "example_count": analysis.dataset.get_example_count(),
            "dataset_size": f"{analysis.dataset.get_example_count()} {type(analysis.dataset).__name__.replace('Dataset', 's').lower()}",
            "show_labels": analysis.label_config is not None and len(analysis.label_config.get_labels_by_name()) > 0,
            # str(label.values)[1:-1] strips the surrounding list brackets for display
            "labels": [{
                "name": label.name,
                "values": str(label.values)[1:-1]
            } for label in analysis.label_config.get_label_objects()] if analysis.label_config else None,
            # encoding section is only rendered when an encoder was configured (show_encoding)
            "encoding_key": analysis.encoder.name if analysis.encoder is not None else None,
            "encoding_name": StringHelper.camel_case_to_word_string(type(analysis.encoder).__name__) if analysis.encoder is not None else None,
            "encoding_params": [{
                "param_name": key,
                "param_value": value
            } for key, value in vars(analysis.encoder).items()] if analysis.encoder is not None else None,
            "show_encoding": analysis.encoder is not None,
            "report": Util.to_dict_recursive(analysis.report_result, base_path)
        } for name, analysis in state.exploratory_analysis_units.items()]
    }
    # post-process each analysis entry: show tables/text sections only when the
    # converted report actually produced them
    for analysis in html_map["analyses"]:
        analysis["show_tables"] = len(analysis["report"]["output_tables"]) > 0 if "output_tables" in analysis["report"] else False
        analysis["show_text"] = len(analysis["report"]["output_text"]) > 0 if "output_text" in analysis["report"] else False
    return html_map
def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label: Label = None,
                            cores_for_training: int = -1, optimization_metric='balanced_accuracy'):
    """
    Fit the model with internal cross-validation over number_of_splits folds,
    optimizing the given metric, after remapping label values via the class mapping.
    """
    self.label = label
    self.feature_names = encoded_data.feature_names
    raw_y = encoded_data.labels[self.label.name]
    self.class_mapping = Util.make_class_mapping(raw_y)
    mapped_y = Util.map_to_new_class_values(raw_y, self.class_mapping)
    self.model = self._fit_by_cross_validation(encoded_data.examples, mapped_y, number_of_splits, label,
                                               cores_for_training, optimization_metric)
def _make_html_map(report_results: dict, result_path: Path, instruction_result_paths: dict) -> dict:
    """
    Build the template map for the multi-dataset benchmark index page: converted
    reports plus links to each instruction's own index.html (relative to result_path).
    The reports section is hidden when no reports were produced.
    """
    instruction_overviews = [{"name": name,
                              "path": Path(os.path.relpath(path / "index.html", result_path))}
                             for name, path in instruction_result_paths.items()]
    reports = Util.to_dict_recursive(report_results.values(), result_path)
    return {
        "css_style": Util.get_css_content(MultiDatasetBenchmarkHTMLBuilder.CSS_PATH),
        "reports": reports,
        'immuneML_version': MLUtil.get_immuneML_version(),
        "show_reports": len(reports) > 0,
        "instruction_overviews": instruction_overviews
    }
def _make_roc_curve(self, hp_item: HPItem, label_name: str, proba_name: str) -> dict:
    """
    Build ROC curve data for one HP setting from its stored test predictions CSV.

    Reads the true classes (column "{label_name}_true_class") and the predicted
    probabilities (column proba_name), remaps true classes through the method's
    class mapping, and returns FPR/TPR arrays, the AUC and the HP setting name.
    """
    df = pd.read_csv(hp_item.test_predictions_path)
    true_y = df[f"{label_name}_true_class"].values
    predicted_y = df[proba_name].values
    true_y = Util.map_to_new_class_values(true_y, hp_item.method.get_class_mapping())
    fpr, tpr, _ = roc_curve(y_true=true_y, y_score=predicted_y)
    # bug fix: sklearn's roc_auc_score has no `true_y`/`predicted_y` keyword
    # arguments (that call raised TypeError); the correct keywords are y_true/y_score
    return {
        "FPR": fpr,
        "TPR": tpr,
        "AUC": roc_auc_score(y_true=true_y, y_score=predicted_y),
        "HPItem": str(hp_item.hp_setting)
    }
def _make_main_html_map(state: TrainMLModelState, base_path: Path) -> dict:
    """
    Build the template map for the main TrainMLModel HTML page: dataset summary,
    label/metric descriptions, assessment and selection configuration, optional
    HP reports, and the per-label HP/model sections (paths relative to base_path).
    """
    html_map = {
        "css_style": Util.get_css_content(HPHTMLBuilder.CSS_PATH),
        "full_specs": Util.get_full_specs_path(base_path),
        # fall back to the dataset identifier when no name was assigned
        "dataset_name": state.dataset.name if state.dataset.name is not None else state.dataset.identifier,
        "dataset_type": StringHelper.camel_case_to_word_string(type(state.dataset).__name__),
        "example_count": state.dataset.get_example_count(),
        "dataset_size": f"{state.dataset.get_example_count()} {type(state.dataset).__name__.replace('Dataset', 's').lower()}",
        # str(label.values)[1:-1] strips the surrounding list brackets for display
        "labels": [{
            "name": label.name,
            "values": str(label.values)[1:-1]
        } for label in state.label_configuration.get_label_objects()],
        "optimization_metric": state.optimization_metric.name.lower(),
        # comma-separated, unquoted list of metric names for display
        "other_metrics": str([metric.name.lower() for metric in state.metrics])[1:-1].replace("'", ""),
        "metrics": [{
            "name": metric.name.lower()
        } for metric in state.metrics],
        "assessment_desc": state.assessment,
        "selection_desc": state.selection,
        "show_hp_reports": bool(state.report_results),
        'hp_reports': Util.to_dict_recursive(state.report_results, base_path) if state.report_results else None,
        "hp_per_label": HPHTMLBuilder._make_hp_per_label(state),
        'models_per_label': HPHTMLBuilder._make_model_per_label(state, base_path),
        'immuneML_version': MLUtil.get_immuneML_version()
    }
    return html_map
def make_html_map(state: SimulationState, base_path: Path) -> dict:
    """
    Build the template map for the simulation HTML page: summary of the resulting
    dataset, its labels, per-format download links and the implanting definitions
    used in the simulation (paths relative to base_path).
    """
    html_map = {
        "css_style": Util.get_css_content(SimulationHTMLBuilder.CSS_PATH),
        "name": state.name,
        'immuneML_version': MLUtil.get_immuneML_version(),
        "full_specs": Util.get_full_specs_path(base_path),
        # fall back to the dataset identifier when no name was assigned
        "dataset_name": state.resulting_dataset.name if state.resulting_dataset.name is not None else state.resulting_dataset.identifier,
        "dataset_type": StringHelper.camel_case_to_word_string(type(state.resulting_dataset).__name__),
        "example_count": state.resulting_dataset.get_example_count(),
        "dataset_size": f"{state.resulting_dataset.get_example_count()} {type(state.resulting_dataset).__name__.replace('Dataset', 's').lower()}",
        "labels": [{
            "label_name": label
        } for label in state.resulting_dataset.get_label_names()],
        # each export format gets a zipped copy with a link relative to base_path
        "formats": [{
            "format_name": format_name,
            "dataset_download_link": os.path.relpath(path=Util.make_downloadable_zip(
                state.result_path, state.paths[state.resulting_dataset.name][format_name]), start=base_path)
        } for format_name in state.formats],
        "implantings": [
            Util.to_dict_recursive(implanting, base_path) for implanting in state.simulation.implantings
        ]
    }
    return html_map
def make_html_map(state: SubsamplingState, base_path: Path) -> dict:
    """
    Build the template map for the subsampling HTML page: summary of the original
    dataset plus, for each subsampled dataset, its size and per-format download links.
    """
    subsampled_entries = []
    for index, dataset in enumerate(state.subsampled_datasets, 1):
        format_entries = [{"dataset_download_link": item, "format_name": key}
                          for key, item in state.subsampled_dataset_paths[dataset.name].items()]
        subsampled_entries.append({
            "sub_dataset_iter": index,
            "sub_dataset_name": dataset.name,
            "dataset_size": f"{dataset.get_example_count()} {type(dataset).__name__.replace('Dataset', 's').lower()}",
            "formats": format_entries
        })

    return {
        "css_style": Util.get_css_content(SubsamplingHTMLBuilder.CSS_PATH),
        "name": state.name,
        'immuneML_version': MLUtil.get_immuneML_version(),
        "full_specs": Util.get_full_specs_path(base_path),
        "dataset_name": state.dataset.name if state.dataset.name is not None else state.dataset.identifier,
        "labels": [{"label_name": label} for label in state.dataset.get_label_names()],
        "dataset_type": StringHelper.camel_case_to_word_string(type(state.dataset).__name__),
        "example_count": state.dataset.get_example_count(),
        "subsampled_datasets": subsampled_entries
    }
def fit(self, encoded_data: EncodedData, label: Label, cores_for_training: int = 2):
    """
    Train the multiple-instance logistic regression on k-mer-encoded examples.

    Each iteration selects, per example, the k-mer with the maximum logit, computes
    BCE-with-logits loss against the label and performs an SGD step. Optionally keeps
    the best (lowest-loss) weights seen at evaluation points and restores them at the
    end (early stopping). Stops early once the loss drops below self.threshold.

    Note: cores_for_training is accepted for interface compatibility but unused here;
    threading is controlled via self.number_of_threads.
    """
    self.feature_names = encoded_data.feature_names
    Util.setup_pytorch(self.number_of_threads, self.random_seed)
    self.input_size = encoded_data.examples.shape[1]
    self._make_log_reg()
    self.label = label
    self.class_mapping = Util.make_binary_class_mapping(encoded_data.labels[self.label.name])

    loss = np.inf
    # best-so-far snapshot for early stopping: lowest loss and the matching weights
    state = {"loss": loss, "model": None}
    loss_func = torch.nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = torch.optim.SGD(self.logistic_regression.parameters(), lr=self.learning_rate)
    # loop-invariant values hoisted out of the training loop
    example_count = encoded_data.examples.shape[0]
    targets = torch.tensor(encoded_data.labels[self.label.name]).float()

    for iteration in range(self.iteration_count):
        # reset gradients
        optimizer.zero_grad()

        # compute predictions only for k-mers with max score
        max_logit_indices = self._get_max_logits_indices(encoded_data.examples)
        examples = torch.from_numpy(encoded_data.examples).float()[torch.arange(example_count).long(), :, max_logit_indices]
        logits = self.logistic_regression(examples)

        # compute the loss
        loss = loss_func(logits, targets)

        # perform update
        loss.backward()
        optimizer.step()

        # log current score and keep model for early stopping if specified
        if iteration % self.evaluate_at == 0 or iteration == self.iteration_count - 1:
            logging.info(f"AtchleyKmerMILClassifier: log loss at iteration {iteration+1}/{self.iteration_count}: {loss}.")
            # bug fix: snapshot the weights when the loss IMPROVES (the old condition
            # `state["loss"] < loss` could never fire since state["loss"] started at inf),
            # store a state_dict (load_state_dict expects one, not a model object), and
            # use loss.item() (calling .numpy() on a grad-tracking tensor raises)
            if loss.item() < state["loss"] and self.use_early_stopping:
                state = {"loss": loss.item(), "model": copy.deepcopy(self.logistic_regression.state_dict())}

        if loss < self.threshold:
            break

    # bug fix: only warn when the loss never dropped below the convergence threshold
    if loss >= self.threshold:
        logging.warning(f"AtchleyKmerMILClassifier: the logistic regression model did not converge.")

    # restore the best weights if the final model is worse than the stored snapshot
    if self.use_early_stopping and state["model"] is not None and loss > state['loss']:
        self.logistic_regression.load_state_dict(state["model"])
def get_package_info(self) -> str:
    """Return the immuneML version string used to annotate exported models/results."""
    version = Util.get_immuneML_version()
    return version
def _make_document(presentations: List[InstructionPresentation], path: Path) -> Path:
    """
    Produce the top-level index.html under path. With multiple instructions an index
    page linking them is rendered from the template; with exactly one, its page is
    copied up and internal paths are fixed; with none, returns None.
    """
    result_path = path / "index.html"
    presentation_count = len(presentations)
    if presentation_count > 1:
        html_map = {
            "instructions": presentations,
            "css_path": EnvironmentSettings.html_templates_path / "css/custom.css",
            "full_specs": Util.get_full_specs_path(path),
            'immuneML_version': MLUtil.get_immuneML_version()
        }
        TemplateParser.parse(template_path=EnvironmentSettings.html_templates_path / "index.html",
                             template_map=html_map, result_path=result_path)
    elif presentation_count == 1:
        shutil.copyfile(str(presentations[0].path), str(result_path))
        HTMLBuilder._update_paths(result_path)
    else:
        result_path = None
    return result_path
def get_package_info(self) -> str:
    """Return the immuneML and DeepRC version strings used to annotate results."""
    immuneml_version = Util.get_immuneML_version()
    deeprc_version = pkg_resources.get_distribution('DeepRC').version
    return f"immuneML {immuneml_version}; deepRC {deeprc_version}"