Пример #1
0
    def _generate(self) -> ReportResult:
        """Produce ROC and precision-recall outputs across all instruction states.

        A subdirectory named after this report is built under the current
        result path and stored back on ``self.result_path``. Before plotting,
        the instruction states are checked for a shared, single label and for
        ``refit_optimal_model`` being ``False`` everywhere.

        Returns:
            ReportResult with the two figures returned by ``plot_roc`` and
            ``plot_precision_recall`` and the concatenation of their tables.

        Raises:
            AssertionError: if the label setup differs between instruction
                states, if more than one label is configured, or if any state
                has ``refit_optimal_model`` set to something other than False.
        """
        self.result_path = PathBuilder.build(self.result_path + f'{self.name}/')

        states = self.instruction_states

        def shares_label_setup(state):
            # A state matches when it has the same label names as the first
            # state and the same values for the first label.
            reference = states[0].label_configuration
            candidate = state.label_configuration
            return (reference.get_labels_by_name() == candidate.get_labels_by_name()
                    and reference.get_label_values(reference.get_labels_by_name()[0])
                    == candidate.get_label_values(candidate.get_labels_by_name()[0]))

        assert all(map(shares_label_setup, states)), \
            "PerformanceOverview: there is a difference in labels between instructions, the plots cannot be created."
        assert len(states[0].label_configuration.get_labels_by_name()) == 1, \
            'PerformanceOverview: multiple labels were provided, but only one can be used in this report.'

        assert not any(state.refit_optimal_model is not False for state in states), \
            f"{PerformanceOverview.__name__}: no test datasets were available to assess the performance of optimal models as they were refitted on " \
            f"the full datasets. No reports will be generated."

        label = states[0].label_configuration.get_label_objects()[0]

        # Only the first optimal hyperparameter item of every state is plotted.
        optimal_hp_items = [list(state.optimal_hp_items.values())[0] for state in states]

        colors = px.colors.sequential.Viridis[::2][::-1]
        figure_auc, table_aucs = self.plot_roc(optimal_hp_items, label, colors)
        figure_pr, table_pr = self.plot_precision_recall(optimal_hp_items, label, colors)

        return ReportResult(output_figures=[figure_auc, figure_pr],
                            output_tables=table_aucs + table_pr)
Пример #2
0
 def _generate(self) -> ReportResult:
     """Reshape the dataset, write the results table and plot it.

     Returns:
         ReportResult named after this report, holding the written table and
         the figure when one was produced.
     """
     PathBuilder.build(self.result_path)
     data_long_format = DataReshaper.reshape(self.dataset)
     table_result = self._write_results_table(data_long_format)
     report_output_fig = self._safe_plot(data_long_format=data_long_format)
     # Pass an empty list rather than None when there is no figure, so that
     # anything iterating or extending output_figures always gets a list.
     output_figures = [report_output_fig] if report_output_fig is not None else []
     return ReportResult(self.name, output_figures, [table_result])
Пример #3
0
    def _generate(self) -> ReportResult:
        """Run ``compute_contributions`` on the test dataset for ``self.label``.

        The test metadata is converted to an hdf5 file by the method, a data
        loader covering every test example is created from it, and the model
        for the label is handed to ``compute_contributions`` together with the
        report's settings.

        Returns:
            ReportResult whose figures are ReportOutput objects built from
            ``self.filename_inputs`` and ``self.filename_kernels``.
        """
        PathBuilder.build(self.result_path)

        metadata_path = self.test_dataset.encoded_data.info['metadata_filepath']
        hdf5_filepath = self.method._metadata_to_hdf5(metadata_path, [self.label])

        # One index per test example, in order.
        example_count = len(self.test_dataset.encoded_data.example_ids)
        all_indices = np.array(range(example_count))

        dataloader = self.method.make_data_loader(hdf5_filepath,
                                                  pre_loaded_hdf5_file=None,
                                                  indices=all_indices,
                                                  label=self.label,
                                                  eval_only=True,
                                                  is_train=False)

        model = self.method.get_model(self.label)[self.label]

        contribution_settings = {
            "n_steps": self.n_steps,
            "threshold": self.threshold,
            "resdir": self.result_path,
            "filename_inputs": self.filename_inputs,
            "filename_kernels": self.filename_kernels,
        }
        compute_contributions(intgrds_set_loader=dataloader,
                              deeprc_model=model,
                              **contribution_settings)

        figure_outputs = [ReportOutput(filename)
                          for filename in (self.filename_inputs, self.filename_kernels)]
        return ReportResult(self.name, output_figures=figure_outputs)
Пример #4
0
    def _generate(self) -> ReportResult:
        """Collect overlap figures and tables for the optimal and assessment models.

        The optimal model for ``self.label`` and the optimal assessment item of
        every assessment state are each included only when their encoder passes
        ``ReferenceSequenceOverlap._check_encoder_class``.

        Returns:
            ReportResult named after this report with all collected figures
            and tables.
        """
        PathBuilder.build(self.result_path)

        figures = []
        tables = []

        optimal_encoder = self.state.optimal_hp_items[self.label].encoder
        if ReferenceSequenceOverlap._check_encoder_class(optimal_encoder):
            figure, data = self._compute_optimal_model_overlap()
            figures.append(figure)
            tables.append(data)

        for assessment_state in self.state.assessment_states:
            encoder = assessment_state.label_states[self.label].optimal_assessment_item.encoder
            if not ReferenceSequenceOverlap._check_encoder_class(encoder):
                continue

            # Split numbers are 1-based in file names and in the description.
            split_number = assessment_state.split_index + 1
            figure_filename = f"{self.result_path}assessment_split_{split_number}_model_vs_reference_overlap_{self.label}.pdf"
            df_filename = f"{self.result_path}assessment_split_{split_number}_overlap_sequences_{self.label}"
            description = (f"overlap sequences between the model for assessment split "
                           f"{split_number} and reference list")

            figure, data = self._compute_model_overlap(figure_filename, df_filename, encoder, description)
            figures.append(figure)
            tables.append(data)

        return ReportResult(self.name, output_figures=figures, output_tables=tables)
Пример #5
0
    def _generate(self) -> ReportResult:
        """Generate the report data and plot it.

        Returns:
            ReportResult named after this report; its figure list is empty
            when ``_safe_plot`` returned None.
        """
        figure = self._safe_plot(data=self._generate_data(), output_written=False)

        if figure is None:
            output_figures = []
        else:
            output_figures = [figure]

        return ReportResult(self.name, output_figures=output_figures)
Пример #6
0
    def _generate(self) -> ReportResult:
        """Export the matrix, details and labels and bundle them in a result.

        Returns:
            ReportResult named after this report, with the matrix export as
            its only table and the details and labels exports as text outputs.
        """
        PathBuilder.build(self.result_path)

        # Exports run in the order matrix, details, labels.
        tables = [self._export_matrix()]
        texts = [self._export_details(), self._export_labels()]

        return ReportResult(self.name, output_tables=tables, output_text=texts)
Пример #7
0
    def _generate(self) -> ReportResult:
        """Retrieve the plotting data, write it as a table and plot it.

        Returns:
            ReportResult named after this report with the results table and,
            when ``_safe_plot`` returned something other than None, the figure.
        """
        PathBuilder.build(self.result_path)

        data = self._retrieve_plotting_data()
        table = self._write_results_table(data)
        figure = self._safe_plot(plotting_data=data)

        figures = []
        if figure is not None:
            figures.append(figure)

        return ReportResult(self.name, output_tables=[table], output_figures=figures)
Пример #8
0
    def _generate(self) -> ReportResult:
        """Plot the distribution between the limits given by ``get_distribution_limits``.

        Sets ``self.result_name`` to ``"beta_distribution"`` before plotting.

        Returns:
            ReportResult with a fixed descriptive name; its figure list is
            empty when ``_safe_plot`` returned None.
        """
        PathBuilder.build(self.result_path)

        upper_limit, lower_limit = self.get_distribution_limits()
        self.result_name = "beta_distribution"

        figure = self._safe_plot(upper_limit=upper_limit,
                                 lower_limit=lower_limit,
                                 output_written=False)
        figures = [figure] if figure is not None else []

        return ReportResult(name="Beta distribution priors - probability that a sequence is disease-associated",
                            output_figures=figures)
Пример #9
0
    def _generate(self) -> ReportResult:
        """Compute and export the overlap matrix across instruction states.

        A subdirectory named after this report is built under the current
        result path and stored back on ``self.result_path``. The optimal
        hyperparameter item for ``self.label`` of each instruction state is
        passed to ``SequenceAnalysisHelper.compute_overlap_matrix``, and the
        dataset names serve as labels for the figure and the exported matrix.

        Returns:
            ReportResult with one figure output and one table output.
        """
        self.result_path = PathBuilder.build(f"{self.result_path}/{self.name}/")
        self._extract_label()

        states = self.instruction_states

        optimal_items = [state.optimal_hp_items[self.label] for state in states]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(optimal_items)

        dataset_names = [state.dataset.name for state in states]
        figure_path = self._make_figure(overlap_matrix, dataset_names)
        data_path = self._export_matrix(overlap_matrix, dataset_names)

        figure_output = ReportOutput(figure_path, 'sequence overlap across datasets')
        table_output = ReportOutput(data_path, 'sequence overlap across datasets (csv)')

        return ReportResult(output_figures=[figure_output], output_tables=[table_output])
Пример #10
0
    def _generate(self) -> ReportResult:
        """Copy the relevant-sequence CSV into the result path with renamed columns.

        The CSV is read from the path stored under ``"relevant_sequence_path"``
        in the encoded data info, its columns are renamed with the mapping
        from ``_compute_column_mapping``, and the result is written without
        the index to ``relevant_sequences.csv``.

        Returns:
            ReportResult named after this report with the written CSV as its
            only table.
        """
        source_path = self.dataset.encoded_data.info["relevant_sequence_path"]
        sequences = pd.read_csv(source_path)
        sequences = sequences.rename(columns=self._compute_column_mapping(sequences))

        PathBuilder.build(self.result_path)
        filename = f"{self.result_path}relevant_sequences.csv"
        sequences.to_csv(filename, index=False)

        table_output = ReportOutput(filename, "relevant sequences")
        return ReportResult(self.name, output_tables=[table_output])
Пример #11
0
    def _generate(self) -> ReportResult:
        """Plot every kernel of both convolution chains and the fully connected layer.

        Returns:
            ReportResult accumulating the figures and tables from
            ``_plot_kernels`` (one call per kernel name) and ``_plot_fc_layer``.
        """
        PathBuilder.build(self.result_path)
        result = ReportResult()
        alphabet = EnvironmentSettings.get_sequence_alphabet(self.method.sequence_type)

        cnn = self.method.CNN
        kernel_names = cnn.conv_chain_1 + cnn.conv_chain_2
        for kernel_name in kernel_names:
            kernel_figures, kernel_tables = self._plot_kernels(kernel_name, alphabet)
            result.output_figures.extend(kernel_figures)
            result.output_tables.extend(kernel_tables)

        fc_figure, fc_table = self._plot_fc_layer()
        result.output_figures.append(fc_figure)
        result.output_tables.append(fc_table)

        return result
Пример #12
0
    def _generate(self) -> ReportResult:
        """Export the receptors of the dataset to a tab-separated file.

        One row is written per receptor with the columns ``CDR3b``, ``TRBV``,
        ``TRBJ``, ``CDR3a``, ``subject:condition`` and ``count``. The file is
        stored as ``exported_data.tsv`` under the result path.

        When the beta chain has no metadata, ``TRBV`` and ``TRBJ`` are left
        empty and ``count`` defaults to 1, instead of failing on the missing
        metadata.

        Returns:
            ReportResult named after this report with the exported file as
            its only table.

        Raises:
            KeyError: presumably, when ``self.condition`` is missing from a
                receptor's metadata (depends on the metadata container).
        """
        PathBuilder.build(self.result_path)
        alpha_chains, beta_chains, trbv, trbj, subject_condition, count = [], [], [], [], [], []
        for index, receptor in enumerate(self.dataset.get_data()):
            # Look the beta chain up once per receptor and reuse it.
            beta_chain = receptor.get_chain("beta")
            beta_metadata = beta_chain.metadata
            has_beta_metadata = beta_metadata is not None

            alpha_chains.append(receptor.get_chain("alpha").amino_acid_sequence)
            beta_chains.append(beta_chain.amino_acid_sequence)
            # Apply the same None guard to the genes as to the count.
            trbv.append(beta_metadata.v_gene if has_beta_metadata else None)
            trbj.append(beta_metadata.j_gene if has_beta_metadata else None)
            # NOTE(review): metadata is read with getattr for subject_id but by
            # subscript for the condition; if it is a plain dict the getattr
            # always falls back to the index. Confirm the intended container.
            subject_condition.append(f"{getattr(receptor.metadata, 'subject_id', str(index))}:{receptor.metadata[self.condition]}")
            count.append(beta_metadata.count if has_beta_metadata else 1)

        df = pd.DataFrame({"CDR3b": beta_chains, "TRBV": trbv, "TRBJ": trbj, "CDR3a": alpha_chains, "subject:condition": subject_condition,
                           "count": count})
        file_path = self.result_path + "exported_data.tsv"
        df.to_csv(file_path, sep="\t", index=False)

        return ReportResult(self.name, output_tables=[ReportOutput(file_path, "exported data in GLIPH2 format")])
Пример #13
0
    def _generate(self) -> ReportResult:
        """Collect comparison tables and figures for every configured label.

        For each label, assessment outputs are added when
        ``self.compare_in_assessment`` is set and selection outputs when
        ``self.compare_in_selection`` is set. None entries are dropped from
        the final result.

        Returns:
            ReportResult named after this report with the remaining figures
            and tables passed positionally in that order.
        """
        PathBuilder.build(self.result_path)

        collected_tables = []
        collected_figures = []

        for label in self.state.label_configuration.get_labels_by_name():
            if self.compare_in_assessment:
                table, figure = self._generate_for_assessment(label)
                collected_tables.append(table)
                collected_figures.append(figure)
            if self.compare_in_selection:
                selection_tables, selection_figures = self._generate_for_selection(label)
                collected_tables.extend(selection_tables)
                collected_figures.extend(selection_figures)

        present_figures = [figure for figure in collected_figures if figure is not None]
        present_tables = [table for table in collected_tables if table is not None]

        return ReportResult(self.name, present_figures, present_tables)
Пример #14
0
    def _generate(self) -> ReportResult:
        """Store and plot the training and test dataframes for ``self.feature``.

        Sets ``self.result_name`` to ``"<feature>_performance"`` before the
        dataframes are created.

        Returns:
            ReportResult with the stored tables and the figure when one was
            produced.
        """
        PathBuilder.build(self.result_path)
        self.result_name = f"{self.feature}_performance"

        training_dataframe, test_dataframe = self._make_plot_dataframes()
        table_results = self._store_dataframes(training_dataframe,
                                               test_dataframe)

        report_output_fig = self._plot(training_dataframe=training_dataframe,
                                       test_dataframe=test_dataframe)
        # Pass an empty list rather than None when there is no figure, so that
        # anything iterating or extending output_figures always gets a list.
        output_figures = [report_output_fig] if report_output_fig is not None else []

        return ReportResult(output_tables=table_results,
                            output_figures=output_figures)
Пример #15
0
    def _generate(self):
        """Export the receptors of the dataset, one output directory per source.

        For a RepertoireDataset, each repertoire's receptors are exported into
        a subdirectory named after the repertoire identifier, and the tables
        of all repertoires are accumulated. For a ReceptorDataset, all
        receptors are exported into a subdirectory named after the dataset
        identifier. Any other dataset type yields no tables.

        Returns:
            ReportResult whose tables are the outputs of ``export_receptorlist``.
        """
        report_output_tables = []

        if isinstance(self.dataset, RepertoireDataset):
            for repertoire in self.dataset.get_data():
                result_path = f"{self.result_path}/{repertoire.identifier}/"
                PathBuilder.build(result_path)
                exported = self.export_receptorlist(repertoire.receptors, result_path)
                # Accumulate rather than overwrite, so the tables of every
                # repertoire are reported and not only the last one's.
                report_output_tables.extend(exported or [])
        elif isinstance(self.dataset, ReceptorDataset):
            receptors = self.dataset.get_data()
            result_path = f"{self.result_path}/{self.dataset.identifier}/"
            PathBuilder.build(result_path)
            report_output_tables = self.export_receptorlist(
                receptors, result_path=result_path)

        return ReportResult(output_tables=report_output_tables)
Пример #16
0
    def _generate(self) -> ReportResult:
        """Cluster the positive examples with TCRdist and discover motifs per cluster.

        The first label of the training dataset's encoded data is stored on
        ``self.label``. Distances are computed with ``TCRdistHelper``, the
        result of ``hcluster_diff`` is stored on the returned object, and
        every cluster with at least ``self.min_cluster_size`` neighbors is
        passed to ``_discover_motif_in_cluster``. A summary is written to
        ``tcrdist_summary.csv`` in the result path.

        Returns:
            ReportResult named "TCRdist motif discovery" with the collected
            figures and tables, the summary CSV being the last table.
        """
        self.label = list(self.train_dataset.encoded_data.labels.keys())[0]

        # Imported inside the method, as in the original code.
        from source.util.TCRdistHelper import TCRdistHelper
        from tcrdist.rep_diff import hcluster_diff

        PathBuilder.build(self.result_path)

        subsampled_dataset = self._extract_positive_example_dataset()
        reference_sequences = self._extract_reference_sequences()
        tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label], self.cores)

        combined_distances = tcr_rep.pw_alpha + tcr_rep.pw_beta
        tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df,
                                                       pwmat=combined_distances,
                                                       x_cols=["epitope"],
                                                       count_col='count')

        figures = []
        tables = []

        logging.info(
            f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.'
        )

        for index, row in tcr_rep.hcluster_df.iterrows():
            # Clusters below the minimum size are skipped.
            if len(row['neighbors_i']) < self.min_cluster_size:
                continue
            cluster_figures, cluster_tables = self._discover_motif_in_cluster(
                tcr_rep, index, row, reference_sequences)
            figures.extend(cluster_figures)
            tables.extend(cluster_tables)

        # NOTE(review): member_summ is not imported inside this method; it is
        # presumably imported at module level - confirm.
        summary = member_summ(res_df=tcr_rep.hcluster_df,
                              clone_df=tcr_rep.clone_df,
                              addl_cols=['epitope'])
        summary_path = self.result_path + "tcrdist_summary.csv"
        summary.to_csv(summary_path)

        tables.append(ReportOutput(path=summary_path, name="TCRdist summary (csv)"))

        return ReportResult("TCRdist motif discovery", figures, tables)
Пример #17
0
    def _write_reports(self) -> ReportResult:
        """Write the match tables that apply to the dataset's encoding.

        The match table and repertoire sizes are always written. For the
        ``MatchedSequencesEncoder`` encoding only sequence info is added.
        Otherwise paired matches are added when the ``"chain"`` feature
        annotation has exactly two unique values, and receptor info is added
        for the ``MatchedReceptorsEncoder`` encoding.

        Returns:
            ReportResult named after this report with all written tables.
        """
        output_tables = [self._write_match_table(), self._write_repertoire_sizes()]

        encoded_data = self.dataset.encoded_data

        if encoded_data.encoding == "MatchedSequencesEncoder":
            output_tables.extend(self._write_sequence_info(self.result_path + "/sequence_info"))
            return ReportResult(self.name, output_tables=output_tables)

        unique_chains = encoded_data.feature_annotations["chain"].unique()
        if len(unique_chains) == 2:
            output_tables.extend(self._write_paired_matches(self.result_path + "/paired_matches"))

        if encoded_data.encoding == "MatchedReceptorsEncoder":
            output_tables.extend(self._write_receptor_info(self.result_path + "/receptor_info"))

        return ReportResult(self.name, output_tables=output_tables)
Пример #18
0
    def _generate(self):
        """Write the coefficient table and plot the requested coefficient subsets.

        The plot data gets an ``abs_coefficients`` column and is sorted by it
        in descending order, in place. Depending on ``self._coefs_to_plot``,
        plots are made for all coefficients, the nonzero ones, those at or
        above each cutoff in ``self._cutoff``, and the n largest for each
        value in ``self._n_largest``.

        Returns:
            ReportResult named after this report with the coefficient table
            and every plot output that is not None.
        """
        PathBuilder.build(self.result_path)

        self._set_plotting_parameters()

        plot_data = self._retrieve_plot_data()
        plot_data["abs_coefficients"] = plot_data["coefficients"].abs()
        plot_data.sort_values(by="abs_coefficients", inplace=True, ascending=False)

        result_table_path = self._write_results_table(plot_data[["features", "coefficients"]])
        self._write_settings()

        figure_outputs = []

        def plot_subset(subset, output_name):
            # Every plot result is recorded; None entries are filtered on return.
            figure_outputs.append(self._plot(plotting_data=subset, output_name=output_name))

        requested = self._coefs_to_plot

        if CoefficientPlottingSetting.ALL in requested:
            plot_subset(plot_data, "all_coefficients")

        if CoefficientPlottingSetting.NONZERO in requested:
            plot_subset(plot_data[plot_data["coefficients"] != 0], "nonzero_coefficients")

        if CoefficientPlottingSetting.CUTOFF in requested:
            for cutoff_val in self._cutoff:
                above_cutoff = plot_data[plot_data["abs_coefficients"] >= cutoff_val]
                plot_subset(above_cutoff, f"cutoff_{cutoff_val}_coefficients")

        if CoefficientPlottingSetting.N_LARGEST in requested:
            for n_val in self._n_largest:
                largest = plot_data.nlargest(n=n_val, columns=["abs_coefficients"])
                plot_subset(largest, f"largest_{n_val}_coefficients")

        table_output = ReportOutput(result_table_path, "features and coefficients csv")
        return ReportResult(self.name, output_tables=[table_output],
                            output_figures=[output for output in figure_outputs if output is not None])
Пример #19
0
 def _generate(self) -> ReportResult:
     """Plot the sequence lengths obtained from ``_get_sequence_lengths``.

     Returns:
         ReportResult named after the report's class, holding the figure
         when one was produced.
     """
     sequence_lengths = self._get_sequence_lengths()
     report_output_fig = self._plot(sequence_lengths=sequence_lengths)
     # Pass an empty list rather than None when there is no figure, so that
     # anything iterating or extending output_figures always gets a list.
     output_figures = [report_output_fig] if report_output_fig is not None else []
     return ReportResult(type(self).__name__, output_figures=output_figures)
Пример #20
0
 def _generate(self) -> ReportResult:
     """Generate the sequence length distribution plot and its tables.

     Returns:
         ReportResult named after this report with the single figure and
         the tables returned by
         ``_generate_sequence_length_distribution_plots``.
     """
     PathBuilder.build(self.result_path)

     length_figure, length_tables = self._generate_sequence_length_distribution_plots()

     return ReportResult(name=self.name, output_figures=[length_figure], output_tables=length_tables)