Example #1
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)

        test_metadata_filepath = self.test_dataset.encoded_data.info[
            'metadata_filepath']
        label_names = [self.label]
        hdf5_filepath = self.method._metadata_to_hdf5(test_metadata_filepath,
                                                      label_names)

        n_examples_test = len(self.test_dataset.encoded_data.example_ids)
        indices = np.array(range(n_examples_test))

        dataloader = self.method.make_data_loader(hdf5_filepath,
                                                  pre_loaded_hdf5_file=None,
                                                  indices=indices,
                                                  label=self.label,
                                                  eval_only=True,
                                                  is_train=False)

        model = self.method.get_model(self.label)[self.label]

        compute_contributions(intgrds_set_loader=dataloader,
                              deeprc_model=model,
                              n_steps=self.n_steps,
                              threshold=self.threshold,
                              resdir=self.result_path,
                              filename_inputs=self.filename_inputs,
                              filename_kernels=self.filename_kernels)

        return ReportResult(self.name,
                            output_figures=[
                                ReportOutput(self.filename_inputs),
                                ReportOutput(self.filename_kernels)
                            ])
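Every example in this collection follows the same report-generation pattern: build the result directory, write one or more artifacts to disk, wrap each artifact path in a ReportOutput, and bundle the outputs into a ReportResult. The sketch below isolates that pattern; the ReportOutput/ReportResult dataclasses are hypothetical stand-ins that only mirror how the real immuneML classes are called in these examples, and generate_report is an illustrative helper, not part of the codebase.

# Minimal sketch of the shared pattern, with hypothetical stand-in containers.
import os
from dataclasses import dataclass, field
from typing import List, Optional

import pandas as pd


@dataclass
class ReportOutput:  # stand-in mirroring ReportOutput(path, name) as used above
    path: str
    name: Optional[str] = None


@dataclass
class ReportResult:  # stand-in mirroring ReportResult(name, output_figures=..., output_tables=...)
    name: Optional[str] = None
    output_figures: List[ReportOutput] = field(default_factory=list)
    output_tables: List[ReportOutput] = field(default_factory=list)


def generate_report(result_path: str, df: pd.DataFrame) -> ReportResult:
    """Build the result directory, write the artifact, wrap its path, return the bundle."""
    os.makedirs(result_path, exist_ok=True)  # the examples use PathBuilder.build(...)
    table_path = os.path.join(result_path, "data.csv")
    df.to_csv(table_path, index=False)
    return ReportResult("example report",
                        output_tables=[ReportOutput(table_path, "data (csv)")])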
Example #2
    def plot_roc(self, optimal_hp_items, label: Label,
                 colors) -> Tuple[ReportOutput, List[ReportOutput]]:
        report_data_outputs = []
        figure = go.Figure()

        figure.add_trace(
            go.Scatter(x=[0, 1],
                       y=[0, 1],
                       mode='lines',
                       name='baseline',
                       line=dict(color=PerformanceOverview.PLOTLY_BLACK,
                                 dash='dash'),
                       hoverinfo="skip"))

        for index, item in enumerate(optimal_hp_items):
            if item.test_predictions_path is None:
                logging.warning(
                    f'{PerformanceOverview.__name__}: there are no test predictions for dataset '
                    f'{self.instruction_states[index].dataset.name}, skipping this dataset when generating performance overview...'
                )
            else:

                df = pd.read_csv(item.test_predictions_path)
                true_class = df[f"{label.name}_true_class"].values
                predicted_class = df[
                    f"{label.name}_{label.positive_class}_proba"].values
                fpr, tpr, _ = metrics.roc_curve(y_true=true_class,
                                                y_score=predicted_class)
                auc = metrics.roc_auc_score(true_class, predicted_class)
                name = self.instruction_states[
                    index].dataset.name + f' (AUC = {round(auc, 2)})'
                figure.add_trace(
                    go.Scatter(x=fpr,
                               y=tpr,
                               mode='lines',
                               name=name,
                               marker=dict(color=colors[index],
                                           line=dict(width=3)),
                               hoverinfo="skip"))

                data_path = self.result_path + f"roc_curve_data_{name}.csv"
                pd.DataFrame({
                    "FPR": fpr,
                    "TPR": tpr
                }).to_csv(data_path, index=False)
                report_data_outputs.append(
                    ReportOutput(data_path,
                                 f'ROC curve data for dataset {name} (csv)'))

        figure_path = self.result_path + "roc_curve.html"
        figure.update_layout(template='plotly_white',
                             xaxis_title='false positive rate',
                             yaxis_title='true positive rate')
        figure.write_html(figure_path)

        return ReportOutput(figure_path, 'ROC curve'), report_data_outputs
Example #3
    def _generate(self) -> ReportResult:
        self.result_path = PathBuilder.build(f"{self.result_path}/{self.name}/")
        self._extract_label()

        hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

        labels = [state.dataset.name for state in self.instruction_states]
        figure_path = self._make_figure(overlap_matrix, labels)
        data_path = self._export_matrix(overlap_matrix, labels)

        return ReportResult(output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                            output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
Example #4
    def _discover_motif_in_cluster(
        self,
        tcr_rep,
        index,
        row,
        negative_examples=None
    ) -> Tuple[List[ReportOutput], List[ReportOutput]]:
        from tcrdist.adpt_funcs import get_centroid_seq
        from tcrdist.summarize import _select
        from palmotif import compute_pal_motif, svg_logo

        dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'], ]
        figure_outputs, table_outputs = [], []

        logging.info(
            f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors."
        )

        for chain in ['a', 'b']:

            if dfnode.shape[0] > 2:
                centroid, *_ = get_centroid_seq(df=dfnode)
            else:
                centroid = dfnode[f'cdr3_{chain}_aa'].to_list()[0]

            motif, stat = compute_pal_motif(
                seqs=_select(df=tcr_rep.clone_df,
                             iloc_rows=row['neighbors_i'],
                             col=f'cdr3_{chain}_aa'),
                centroid=centroid,
                refs=negative_examples[chain]
                if self.use_reference_sequences else None)

            figure_path = self.result_path + f"motif_{chain}_{index + 1}.svg"
            svg_logo(motif, filename=figure_path)

            motif_data_path = self.result_path + f"motif_{chain}_{index + 1}.csv"
            motif.to_csv(motif_data_path)

            figure_outputs.append(
                ReportOutput(
                    figure_path,
                    f'Motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain)'
                ))
            table_outputs.append(
                ReportOutput(
                    motif_data_path,
                    f'motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain) csv data'
                ))

        return figure_outputs, table_outputs
Example #5
    def _store_dataframes(self, training_dataframe: pd.DataFrame,
                          test_dataframe: pd.DataFrame) -> List[ReportOutput]:
        train_path = self.result_path + "training_performance.csv"
        test_path = self.result_path + "test_performance.csv"
        training_dataframe.to_csv(train_path, index=False)
        test_dataframe.to_csv(test_path, index=False)

        return [
            ReportOutput(
                path=train_path,
                name=f"Training performance w.r.t. {self.feature} values"),
            ReportOutput(path=test_path,
                         name=f"Test performance w.r.t. {self.feature} values")
        ]
Example #6
    def _store_sequence_distribution_data(self, fig, dfs, chains):
        fig.write_html(self.result_path + "sequence_length_distribution.html")
        image_output = ReportOutput(
            self.result_path + "sequence_length_distribution.html",
            name="sequence length distribution per chain")
        table_outputs = [
            ReportOutput(
                self.result_path +
                f"sequence_length_distribution_chain_{chains[index]}.csv")
            for index in range(len(chains))
        ]
        for index, df in enumerate(dfs):
            df.to_csv(table_outputs[index].path, index=False)

        return image_output, table_outputs
Example #7
    def _write_repertoire_sizes(self):
        """
        Writes the repertoire sizes (# clones & # reads) per subject, per chain.
        """
        all_subjects = self.dataset.encoded_data.example_ids
        all_chains = sorted(
            set(self.dataset.encoded_data.feature_annotations["chain"]))

        results_df = pd.DataFrame(list(
            itertools.product(all_subjects, all_chains)),
                                  columns=["subject_id", "chain"])
        results_df["n_reads"] = 0
        results_df["n_clones"] = 0

        for repertoire in self.dataset.repertoires:
            rep_counts = repertoire.get_counts()
            rep_chains = repertoire.get_chains()

            for chain in all_chains:
                indices = rep_chains == Chain.get_chain(chain.upper())
                results_df.loc[(
                    results_df.subject_id == repertoire.metadata["subject_id"])
                               & (results_df.chain == chain),
                               'n_reads'] += np.sum(rep_counts[indices])
                results_df.loc[(
                    results_df.subject_id == repertoire.metadata["subject_id"])
                               & (results_df.chain == chain),
                               'n_clones'] += len(rep_counts[indices])

        results_path = os.path.join(self.result_path, "repertoire_sizes.csv")
        results_df.to_csv(results_path, index=False)

        return ReportOutput(results_path, "repertoire sizes")
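The (subject, chain) scaffold that _write_repertoire_sizes fills is easiest to see in isolation: itertools.product yields one row per subject/chain pair, and the count columns start at zero before the per-repertoire loop increments them. A standalone sketch with hypothetical subject IDs and chain names:

import itertools

import pandas as pd

# hypothetical subjects and chains, purely for illustration
all_subjects = ["subject_1", "subject_2"]
all_chains = ["ALPHA", "BETA"]

# one row per (subject, chain) combination, counts initialised to zero
results_df = pd.DataFrame(list(itertools.product(all_subjects, all_chains)),
                          columns=["subject_id", "chain"])
results_df["n_reads"] = 0
results_df["n_clones"] = 0

print(results_df)
#   subject_id  chain  n_reads  n_clones
# 0  subject_1  ALPHA        0         0
# 1  subject_1   BETA        0         0
# 2  subject_2  ALPHA        0         0
# 3  subject_2   BETA        0         0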
Example #8
    def _write_paired_matches(self, paired_matches_path) -> List[ReportOutput]:
        PathBuilder.build(paired_matches_path)

        report_outputs = []
        # todo: don't mention subject in the name twice
        for i in range(len(self.dataset.encoded_data.example_ids)):
            filename = "example_{}_".format(
                self.dataset.encoded_data.example_ids[i])
            filename += "_".join([
                "{label}_{value}".format(label=label, value=values[i])
                for label, values in self.dataset.encoded_data.labels.items()
            ])
            filename += ".csv"
            filename = os.path.join(paired_matches_path, filename)

            if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder":
                self._write_paired_receptor_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], filename)
            elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder":
                self._write_paired_regex_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], filename)

            report_outputs.append(
                ReportOutput(
                    filename,
                    f"example {self.dataset.encoded_data.example_ids[i]} paired matches"
                ))

        return report_outputs
Example #9
    def _plot(self, data) -> ReportOutput:
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import STAP

        pandas2ri.activate()

        with open(EnvironmentSettings.root_path +
                  "source/visualization/SequencingDepthOverview.R") as f:
            string = f.read()

        plot = STAP(string, "plot")

        PathBuilder.build(self.result_path)

        plot.plot_sequencing_depth_overview(
            data=data[[self.x, "value", "frame_type", "feature", "id"] +
                      self.facets],
            x=self.x,
            color=self.color,
            facets=self.facets,
            palette=json.dumps(self.palette),
            nrow_distributions=self.nrow_distributions,
            nrow_scatterplot=self.nrow_scatterplot,
            height_distributions=self.height_distributions,
            height_scatterplot=self.height_scatterplot,
            width=self.width,
            result_path=self.result_path,
            result_name=self.result_name)
        return ReportOutput(path=f"{self.result_path}{self.result_name}.pdf")
Example #10
    def _export_matrix(self, overlap_matrix, filename,
                       row_col_names) -> ReportOutput:
        data_path = f"{self.result_path}{filename}.csv"
        pd.DataFrame(overlap_matrix,
                     columns=row_col_names,
                     index=row_col_names).to_csv(data_path)
        return ReportOutput(data_path,
                            " ".join(filename.split('_') + ['data']))
Example #11
    def _write_receptor_info(self, receptor_info_path) -> List[ReportOutput]:
        PathBuilder.build(receptor_info_path)

        receptor_chains = self.dataset.encoded_data.feature_annotations
        chain_types = receptor_chains["chain"].unique()

        # work on copies so dropping the chain column does not modify (or warn
        # about) the original feature_annotations slice
        first_chains = receptor_chains.loc[receptor_chains.chain ==
                                           chain_types[0]].copy()
        second_chains = receptor_chains.loc[receptor_chains.chain ==
                                            chain_types[1]].copy()

        first_chains.drop(columns=["chain"], inplace=True)
        second_chains.drop(columns=["chain"], inplace=True)

        on_cols = ["receptor_id"]
        if "clonotype_id" in first_chains.columns and "clonotype_id" in second_chains.columns:
            on_cols += ["clonotype_id"]

        receptors = pd.merge(first_chains,
                             second_chains,
                             on=on_cols,
                             suffixes=(f"_{chain_types[0]}",
                                       f"_{chain_types[1]}"))

        unique_alpha_chains = first_chains.drop_duplicates(
            subset=["sequence", "v_gene", "j_gene"])
        unique_beta_chains = second_chains.drop_duplicates(
            subset=["sequence", "v_gene", "j_gene"])
        unique_receptors = receptors.drop_duplicates(subset=[
            f"sequence_{chain_types[0]}", f"v_gene_{chain_types[0]}",
            f"j_gene_{chain_types[0]}", f"sequence_{chain_types[1]}",
            f"v_gene_{chain_types[1]}", f"j_gene_{chain_types[1]}"
        ])

        receptor_chains_path = os.path.join(receptor_info_path,
                                            "all_chains.csv")
        receptor_chains.to_csv(receptor_chains_path, index=False)
        receptors_path = os.path.join(receptor_info_path, "all_receptors.csv")
        receptors.to_csv(receptors_path, index=False)
        unique_chain1_path = os.path.join(
            receptor_info_path, f"unique_{chain_types[0]}_chains.csv")
        unique_alpha_chains.to_csv(unique_chain1_path, index=False)
        unique_chain2_path = os.path.join(
            receptor_info_path, f"unique_{chain_types[1]}_chains.csv")
        unique_beta_chains.to_csv(unique_chain2_path, index=False)
        unique_receptors_path = os.path.join(receptor_info_path,
                                             "unique_receptors.csv")
        unique_receptors.to_csv(unique_receptors_path, index=False)

        return [
            ReportOutput(p) for p in [
                receptors_path, receptor_chains_path, unique_receptors_path,
                unique_chain1_path, unique_chain2_path
            ]
        ]
Example #12
    def _plot_fc_figure(self, df, bias):
        fig = make_subplots(rows=1, cols=2, column_widths=[0.8, 0.2], specs=[[{"type": "bar"}, {'type': "table"}]])
        fig.add_trace(go.Bar(x=df["names"], y=df["weights"], name="weights", hovertemplate='Weight for %{x}: %{y:.4f}<extra></extra>',
                             hoverlabel={"font_color": "white"}, marker_color=px.colors.diverging.Tealrose[0]), row=1, col=1)
        table = go.Table(header={"values": ["bias"]}, cells={"values": bias})
        table.cells.format = [[None], ['.3f']]
        fig.add_trace(table, row=1, col=2)
        fig.update_layout(template="plotly_white")
        fig.write_html(self.result_path + "fully_connected_layer_weights.html")

        return ReportOutput(self.result_path + "fully_connected_layer_weights.html", "fully-connected layer weights")
Example #13
    def plot_precision_recall(self, optimal_hp_items: list, label: Label,
                              colors):
        report_data_outputs = []
        figure = go.Figure()

        for index, item in enumerate(optimal_hp_items):
            df = pd.read_csv(item.test_predictions_path)

            true_class = df[f"{label.name}_true_class"].values
            predicted_proba = df[
                f"{label.name}_{label.positive_class}_proba"].values
            precision, recall, _ = precision_recall_curve(
                y_true=true_class, probas_pred=predicted_proba)
            name = self.instruction_states[index].dataset.name
            figure.add_trace(
                go.Scatter(x=recall,
                           y=precision,
                           mode='lines',
                           name=name,
                           marker=dict(color=colors[index],
                                       line=dict(width=3)),
                           hoverinfo="skip"))

            data_path = self.result_path + f"precision_recall_data_{name}.csv"
            pd.DataFrame({
                "precision": precision,
                "recall": recall
            }).to_csv(data_path, index=False)
            report_data_outputs.append(
                ReportOutput(
                    data_path,
                    f'precision-recall curve data for dataset {name}'))

        figure_path = self.result_path + "precision_recall_curve.html"
        figure.update_layout(template='plotly_white',
                             xaxis_title="recall",
                             yaxis_title="precision")
        figure.write_html(figure_path)

        return ReportOutput(figure_path,
                            'precision-recall curve'), report_data_outputs
Example #14
    def _export_details(self) -> ReportOutput:
        file_path = f"{self.result_path}encoding_details.yaml"
        with open(file_path, "w") as file:
            details = {
                "feature_names": self.dataset.encoded_data.feature_names,
                "encoding": self.dataset.encoded_data.encoding,
                "example_ids": list(self.dataset.encoded_data.example_ids)
            }

            yaml.dump(details, file)

        return ReportOutput(file_path, "encoding details")
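Because _export_details dumps a plain dictionary, the resulting YAML is easy to preview. A small sketch with hypothetical values (with PyYAML's current defaults, yaml.dump sorts keys alphabetically and writes lists in block style):

import yaml

# hypothetical encoding details, mirroring the keys written above
details = {"feature_names": ["AAA", "AAC"],
           "encoding": "KmerFrequencyEncoder",
           "example_ids": ["subject_1", "subject_2"]}

print(yaml.dump(details))
# encoding: KmerFrequencyEncoder
# example_ids:
# - subject_1
# - subject_2
# feature_names:
# - AAA
# - AAC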
Example #15
    def _write_match_table(self):
        id_df = pd.DataFrame(
            {"repertoire_id": self.dataset.encoded_data.example_ids})
        label_df = pd.DataFrame(self.dataset.encoded_data.labels)
        matches_df = pd.DataFrame(
            self.dataset.encoded_data.examples,
            columns=self.dataset.encoded_data.feature_names)

        result_path = os.path.join(self.result_path,
                                   "complete_match_count_table.csv")
        id_df.join(label_df).join(matches_df).to_csv(result_path, index=False)

        return ReportOutput(result_path, "All matches")
Example #16
    def _plot(self, sequence_lengths: Counter):

        df = pd.DataFrame({"counts": list(sequence_lengths.values()), 'sequence_lengths': list(sequence_lengths.keys())})

        figure = px.bar(df, x="sequence_lengths", y="counts")
        figure.update_layout(xaxis=dict(tickmode='array', tickvals=df["sequence_lengths"]), yaxis=dict(tickmode='array', tickvals=df["counts"]),
                             title="Sequence length distribution", template="plotly_white")
        figure.update_traces(marker_color=px.colors.diverging.Tealrose[0])
        PathBuilder.build(self.result_path)

        file_path = self.result_path + "sequence_length_distribution.html"
        figure.write_html(file_path)
        return ReportOutput(path=file_path, name="sequence length distribution plot")
Example #17
    def _generate(self) -> ReportResult:

        df = pd.read_csv(
            self.dataset.encoded_data.info["relevant_sequence_path"])
        column_mapping = self._compute_column_mapping(df)
        df.rename(columns=column_mapping, inplace=True)

        PathBuilder.build(self.result_path)
        filename = f"{self.result_path}relevant_sequences.csv"
        df.to_csv(filename, index=False)

        return ReportResult(
            self.name,
            output_tables=[ReportOutput(filename, "relevant sequences")])
Example #18
    def _write_sequence_info(self, sequence_info_path) -> List[ReportOutput]:
        PathBuilder.build(sequence_info_path)

        chains = self.dataset.encoded_data.feature_annotations
        unique_chains = chains.drop_duplicates(
            subset=["sequence", "v_gene", "j_gene"])

        chains_path = os.path.join(sequence_info_path, "all_chains.csv")
        chains.to_csv(chains_path, index=False)
        unique_chains_path = os.path.join(sequence_info_path,
                                          "unique_chains.csv")
        unique_chains.to_csv(unique_chains_path, index=False)

        return [ReportOutput(p) for p in [chains_path, unique_chains_path]]
Example #19
    def _plot_kernels(self, kernel_name, sequence_alphabet):
        figure_outputs = []
        table_outputs = []
        friendly_kernel_name = copy(kernel_name).replace("chain_1", self.method.chain_names[0]).replace("chain_2", self.method.chain_names[1])

        for i in range(self.method.kernel_count):
            kernel = getattr(self.method.CNN, kernel_name)
            kernel_df = pd.DataFrame(kernel.weight[i].detach().numpy().T[:, :len(sequence_alphabet)], columns=sequence_alphabet)
            kernel_csv_path = self.result_path + friendly_kernel_name + f"_{i + 1}.csv"
            kernel_df.to_csv(kernel_csv_path, index=False)
            table_outputs.append(ReportOutput(kernel_csv_path, friendly_kernel_name + f"_{i + 1}"))

            logo = logomaker.Logo(kernel_df, shade_below=0.5, fade_below=0.5, font_name='Arial Rounded MT Bold', vpad=0.05, vsep=0.01)
            logo_path = self.result_path + friendly_kernel_name + f"_{i + 1}.png"

            logo.style_spines(visible=False)
            logo.style_spines(spines=('left', 'bottom'), visible=True)
            logo.style_xticks(fmt='%d', anchor=0)

            logo.fig.savefig(logo_path)
            plt.close(logo.fig)
            figure_outputs.append(ReportOutput(logo_path, friendly_kernel_name + f"_{i + 1}"))

        return figure_outputs, table_outputs
Example #20
    def _plot(self, plotting_data, output_name):
        if plotting_data.empty:
            logging.warning(f"Coefficients: empty data subset specified, skipping {output_name} plot...")
        else:

            filename = f"{self.result_path}{output_name}.html"

            figure = px.bar(plotting_data, x='features', y='coefficients', template='plotly_white',
                            title=f"{type(self.method).__name__}{' (' + self.method.name + ') - ' if self.method.name is not None else ' - '}"
                                  f"{' '.join(output_name.split('_'))}")
            figure.update_traces(marker_color=px.colors.sequential.Teal[3])

            figure.write_html(filename)

            return ReportOutput(filename)
Example #21
    def _compute_model_overlap(self, figure_filename, df_filename, encoder,
                               name):

        reference_sequences_df = pd.read_csv(
            self.reference_path, usecols=self.comparison_attributes)
        reference_sequences = list(
            reference_sequences_df.to_records(index=False))
        attributes = reference_sequences_df.columns.tolist()

        model_sequences = self._extract_from_model(encoder)

        overlap_sequences = [
            sequence for sequence in model_sequences
            if sequence in reference_sequences
        ]
        count_overlap = len(overlap_sequences)
        count_ref_only = len([
            sequence for sequence in reference_sequences
            if sequence not in model_sequences
        ])
        count_model_only = len([
            sequence for sequence in model_sequences
            if sequence not in reference_sequences
        ])

        self._make_venn_diagram(count_ref_only, count_overlap,
                                count_model_only, 'reference', 'model',
                                figure_filename)
        figure = ReportOutput(figure_filename, name)

        pd.DataFrame.from_records(overlap_sequences,
                                  columns=attributes).to_csv(df_filename,
                                                             index=False)
        data = ReportOutput(df_filename, name)

        return figure, data
Example #22
    def _make_figure(self, overlap_matrix, filename,
                     row_col_names) -> ReportOutput:
        figure = px.imshow(overlap_matrix,
                           x=row_col_names,
                           y=row_col_names,
                           zmin=0,
                           zmax=100,
                           color_continuous_scale=px.colors.sequential.Teal,
                           template='plotly_white')
        figure.update_traces(
            hovertemplate=
            "Overlap of disease-associated<br>sequences between<br>%{x} and %{y}:<br>%{z}%<extra></extra>"
        )
        figure_path = f"{self.result_path}{filename}.html"
        figure.write_html(figure_path)
        return ReportOutput(figure_path, " ".join(filename.split('_')))
Example #23
    def _plot(self, plotting_data):
        plotting_data = self._preprocess_plotting_data(plotting_data)

        metric_name = self.state.optimization_metric.name.replace("_",
                                                                  " ").title()

        if self.single_axis_labels:
            figure = self._plot_single_axis_labels(
                plotting_data, "ML method", f"Performance ({metric_name})")
        else:
            figure = self._plot_rescalable(plotting_data, "ML method",
                                           f"Performance<br>({metric_name})")

        file_path = f"{self.result_path}{self.result_name}.html"
        figure.write_html(file_path)

        return ReportOutput(path=file_path)
Example #24
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        alpha_chains, beta_chains, trbv, trbj, subject_condition, count = [], [], [], [], [], []
        for index, receptor in enumerate(self.dataset.get_data()):
            alpha_chains.append(receptor.get_chain("alpha").amino_acid_sequence)
            beta_chains.append(receptor.get_chain("beta").amino_acid_sequence)
            trbv.append(receptor.get_chain("beta").metadata.v_gene)
            trbj.append(receptor.get_chain("beta").metadata.j_gene)
            subject_condition.append(f"{getattr(receptor.metadata, 'subject_id', str(index))}:{receptor.metadata[self.condition]}")
            count.append(receptor.get_chain("beta").metadata.count if receptor.get_chain('beta').metadata is not None else 1)

        df = pd.DataFrame({"CDR3b": beta_chains, "TRBV": trbv, "TRBJ": trbj, "CDR3a": alpha_chains, "subject:condition": subject_condition,
                           "count": count})
        file_path = self.result_path + "exported_data.tsv"
        df.to_csv(file_path, sep="\t", index=False)

        return ReportResult(self.name, output_tables=[ReportOutput(file_path, "exported data in GLIPH2 format")])
Example #25
    def _generate(self) -> ReportResult:

        self.label = list(self.train_dataset.encoded_data.labels.keys())[0]

        from source.util.TCRdistHelper import TCRdistHelper
        from tcrdist.rep_diff import hcluster_diff
        from tcrdist.summarize import member_summ

        PathBuilder.build(self.result_path)

        subsampled_dataset = self._extract_positive_example_dataset()
        reference_sequences = self._extract_reference_sequences()
        tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset,
                                                 [self.label], self.cores)
        tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(
            clone_df=tcr_rep.clone_df,
            pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta,
            x_cols=["epitope"],
            count_col='count')

        figures, tables = [], []

        logging.info(
            f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.'
        )

        for index, row in tcr_rep.hcluster_df.iterrows():
            if len(row['neighbors_i']) >= self.min_cluster_size:
                figure_outputs, table_outputs = self._discover_motif_in_cluster(
                    tcr_rep, index, row, reference_sequences)
                figures.extend(figure_outputs)
                tables.extend(table_outputs)

        res_summary = member_summ(res_df=tcr_rep.hcluster_df,
                                  clone_df=tcr_rep.clone_df,
                                  addl_cols=['epitope'])
        res_summary.to_csv(self.result_path + "tcrdist_summary.csv")

        tables.append(
            ReportOutput(path=self.result_path + "tcrdist_summary.csv",
                         name="TCRdist summary (csv)"))

        return ReportResult("TCRdist motif discovery", figures, tables)
Example #26
    def _plot(self, upper_limit, lower_limit):
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import STAP

        pandas2ri.activate()

        with open(EnvironmentSettings.root_path + "source/visualization/StatDistributionPlot.R") as f:
            string = f.read()

        plot = STAP(string, "plot")

        plot.plot_beta_distribution_binary_class(alpha0=self.method.alpha_0, beta0=self.method.beta_0,
                                                 alpha1=self.method.alpha_1, beta1=self.method.beta_1,
                                                 x_label=f"probability that receptor sequence is {self.method.label_name}-associated",
                                                 label0=f"{self.method.label_name} {self.method.class_mapping[0]}",
                                                 label1=f"{self.method.label_name} {self.method.class_mapping[1]}",
                                                 upper_limit=upper_limit, lower_limit=lower_limit,
                                                 result_path=self.result_path,
                                                 result_name=self.result_name)

        return ReportOutput(f"{self.result_path}{self.result_name}.pdf")
Example #27
    def _plot(self, data_long_format) -> ReportOutput:
        groupby_cols = [self.x, self.color, self.facet_row, self.facet_column]
        groupby_cols = [i for i in groupby_cols if i]
        groupby_cols = list(set(groupby_cols))
        plotting_data = data_long_format.groupby(groupby_cols, as_index=False).agg(
            {"value": ['mean', self.std]})

        plotting_data.columns = plotting_data.columns.map(''.join)

        figure = px.bar(plotting_data, x=self.x, y="valuemean", color=self.color, barmode="relative",
                        facet_row=self.facet_row, facet_col=self.facet_column, error_y="valuestd",
                        labels={
                            "valuemean": self.y_title,
                            self.x: self.x_title,
                        }, template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        file_path = f"{self.result_path}{self.result_name}.html"
        figure.write_html(file_path)

        return ReportOutput(path=file_path, name="feature bar plot")
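The one non-obvious step above is the column flattening: groupby(...).agg({"value": ['mean', self.std]}) produces a two-level column index, and columns.map(''.join) collapses it into the flat valuemean/valuestd names that px.bar refers to. A small sketch with hypothetical data:

import pandas as pd

# hypothetical long-format feature values, purely for illustration
df = pd.DataFrame({"x": ["a", "a", "b", "b"], "value": [1.0, 3.0, 2.0, 4.0]})

plotting_data = df.groupby(["x"], as_index=False).agg({"value": ["mean", "std"]})
print(list(plotting_data.columns))  # [('x', ''), ('value', 'mean'), ('value', 'std')]

# joining each column tuple yields the flat names used in px.bar(..., y="valuemean", error_y="valuestd")
plotting_data.columns = plotting_data.columns.map("".join)
print(list(plotting_data.columns))  # ['x', 'valuemean', 'valuestd']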
Example #28
    def _plot(self, training_dataframe, test_dataframe):

        optimization_metric = self.state.optimization_metric.name.lower()

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(x=training_dataframe["x"],
                       y=training_dataframe["y"],
                       name="training",
                       mode="markers",
                       marker_size=11,
                       marker_color="#CC79A7",
                       hovertemplate=f"training {optimization_metric}" +
                       ": %{y}<extra></extra>",
                       opacity=0.8))
        fig.add_trace(
            go.Scatter(x=test_dataframe["x"],
                       y=test_dataframe["y"],
                       name="test",
                       mode="markers",
                       marker_size=11,
                       marker_color="#009E73",
                       hovertemplate=f"test {optimization_metric}" +
                       ": %{y}<extra></extra>",
                       opacity=0.8))
        fig.update_layout(legend_title_text="Data",
                          title="Performance across feature values",
                          template="plotly_white")
        fig.update_xaxes(title_text=self.feature)
        if self.is_feature_axis_categorical:
            fig.update_xaxes(type='category')
        fig.update_yaxes(
            title_text=
            f"performance ({self.state.optimization_metric.name.lower()})")
        fig.update_layout(hovermode="x unified")

        file_path = f"{self.result_path}{self.result_name}.html"
        fig.write_html(file_path)

        return ReportOutput(path=file_path)
Example #29
    def _plot(self, data_long_format):
        from rpy2.robjects import pandas2ri
        from rpy2.robjects.packages import STAP

        pandas2ri.activate()

        with open(EnvironmentSettings.root_path +
                  "source/visualization/Distributions.R") as f:
            string = f.read()

        plot = STAP(string, "plot")

        plot.plot_distribution(data=data_long_format,
                               x=self.grouping_label,
                               y="value",
                               color=self.color,
                               group=self.group,
                               type=self.type,
                               facet_rows=self.facet_rows,
                               facet_columns=self.facet_columns,
                               facet_type=self.facet_type,
                               facet_scales=self.facet_scales,
                               facet_switch=self.facet_switch,
                               nrow=self.nrow,
                               ncol=self.ncol,
                               height=self.height,
                               width=self.width,
                               x_lab=self.x_title,
                               y_lab=self.y_title,
                               color_lab=self.color_title,
                               palette=self.palette,
                               result_path=self.result_path,
                               result_name=self.result_name)

        return ReportOutput(f"{self.result_path}{self.result_name}.pdf",
                            "feature dist plot")
Example #30
    def _generate(self):
        PathBuilder.build(self.result_path)
        paths = []

        self._set_plotting_parameters()

        plot_data = self._retrieve_plot_data()
        plot_data["abs_coefficients"] = abs(plot_data["coefficients"])
        plot_data.sort_values(by="abs_coefficients", inplace=True, ascending=False)

        result_table_path = self._write_results_table(plot_data[["features", "coefficients"]])
        self._write_settings()

        if CoefficientPlottingSetting.ALL in self._coefs_to_plot:
            report_output_fig = self._plot(plotting_data=plot_data, output_name="all_coefficients")
            paths.append(report_output_fig)

        if CoefficientPlottingSetting.NONZERO in self._coefs_to_plot:
            nonzero_data = plot_data[plot_data["coefficients"] != 0]
            report_output_fig = self._plot(plotting_data=nonzero_data, output_name="nonzero_coefficients")
            paths.append(report_output_fig)

        if CoefficientPlottingSetting.CUTOFF in self._coefs_to_plot:
            for cutoff_val in self._cutoff:
                cutoff_data = plot_data[plot_data["abs_coefficients"] >= cutoff_val]
                report_output_fig = self._plot(plotting_data=cutoff_data, output_name="cutoff_{}_coefficients".format(cutoff_val))
                paths.append(report_output_fig)

        if CoefficientPlottingSetting.N_LARGEST in self._coefs_to_plot:
            for n_val in self._n_largest:
                n_largest_data = plot_data.nlargest(n=n_val, columns=["abs_coefficients"])
                report_output_fig = self._plot(plotting_data=n_largest_data, output_name="largest_{}_coefficients".format(n_val))
                paths.append(report_output_fig)

        return ReportResult(self.name, output_tables=[ReportOutput(result_table_path, "features and coefficients csv")],
                            output_figures=[p for p in paths if p is not None])