Exemplo n.º 1
0
def annotate_anndata_object(input_file: str, annotation: str) -> None:
    """Apply a cluster-to-cell-type annotation to a file (command line entry point).

    The file is loaded with ``read_input`` in ``h5ad_mode="r+"``, handed to
    ``annotate`` together with the parsed annotation, and written back to the
    same path with ``whitelist=["obs", "uns"]``.

    Parameters
    ----------
    input_file : `str`
        Path of the file to read and then overwrite.
    annotation : `str`
        Specification of the form ``anno_name:clust_name:cell_type1;...;cell_typen``.
        It must contain exactly two ``:`` separators, otherwise unpacking raises
        ``ValueError``. The i-th cell type (counting from 1) is stored under the
        string key ``str(i)``.
    """
    from pegasus.io import read_input, write_output

    adata = read_input(input_file, h5ad_mode="r+")

    anno_name, clust_name, anno_str = annotation.split(":")
    cell_types = anno_str.split(";")
    # Keys are 1-based positions rendered as strings.
    anno_dict = {
        str(rank): label for rank, label in enumerate(cell_types, start=1)
    }

    annotate(adata, anno_name, clust_name, anno_dict)
    write_output(adata, input_file, whitelist=["obs", "uns"])
Exemplo n.º 2
0
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    result_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: str = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """Run differential expression analysis end to end (command line entry point).

    Steps, in order: load ``input_file`` with ``h5ad_mode="r+"``, call
    ``de_analysis``, write ``varm/<result_key>`` back into ``input_file``,
    extract markers with ``markers`` and dump them to ``output_excel_file``.
    Progress and total wall-clock time are reported through ``logger``.

    Parameters
    ----------
    input_file : `str`
        File to analyse; it is also the file the results are written back to.
    output_excel_file : `str`
        Destination passed to ``write_results_to_excel``.
    cluster : `str`
        Forwarded to ``de_analysis`` as its second positional argument.
    result_key : `str`, optional (default: "de_res")
        Key used for ``de_analysis`` results, the ``varm/`` whitelist entry and
        the ``de_key`` of ``markers``.
    n_jobs, auc, t, fisher, mwu, temp_folder, verbose
        Forwarded unchanged to ``de_analysis`` as keyword arguments.
    alpha : `float`, optional (default: 0.05)
        Forwarded to ``markers``.
    ndigits : `int`, optional (default: 3)
        Forwarded to ``write_results_to_excel``.
    """
    began_at = time.time()

    from pegasus.io import read_input, write_output

    adata = read_input(input_file, h5ad_mode="r+")

    de_options = dict(
        result_key=result_key,
        n_jobs=n_jobs,
        auc=auc,
        t=t,
        fisher=fisher,
        mwu=mwu,
        temp_folder=temp_folder,
        verbose=verbose,
    )
    de_analysis(adata, cluster, **de_options)

    # Only the DE result slot is written back to the input file.
    varm_entry = "varm/{}".format(result_key)
    write_output(adata, input_file, whitelist=[varm_entry])
    logger.info(
        "Differential expression results are written to varm/{} in h5ad file.".
        format(result_key))

    marker_tables = markers(adata, de_key=result_key, alpha=alpha)
    write_results_to_excel(marker_tables, output_excel_file, ndigits=ndigits)

    elapsed = time.time() - began_at
    logger.info("run_de_analysis is finished in {:.2f}s.".format(elapsed))
Exemplo n.º 3
0
def down_sample(molecule_info_file,
                output_name,
                total_reads,
                n_sample,
                random_state=0):
    """Subsample a molecule-info HDF5 file and write the resulting raw matrix.

    Per-molecule counts are redrawn with ``sample_hypergeom``; molecules whose
    new count is zero are dropped, the survivors are turned into a sparse
    barcode-by-feature matrix and written with ``write_output``.

    Parameters
    ----------
    molecule_info_file
        Path opened with ``tables.open_file``. The nodes ``/barcode_idx``,
        ``/feature_idx``, ``/count``, ``/barcodes``, ``/features/id``,
        ``/features/name`` and ``/barcode_info/genomes`` are read from it.
    output_name
        Destination passed to ``write_output``.
    total_reads, n_sample, random_state
        Forwarded to ``sample_hypergeom`` after the original counts.
    """
    with tables.open_file(molecule_info_file) as h5_in:

        def _read_node(node_path):
            # Materialise one HDF5 node as an in-memory array.
            return h5_in.get_node(node_path).read()

        barcode_idx = _read_node("/barcode_idx")
        feature_idx = _read_node("/feature_idx")
        count = _read_node("/count")

        new_count = sample_hypergeom(count, total_reads, n_sample,
                                     random_state)

        # Keep only the molecules that survived the subsampling.
        kept = new_count > 0
        barcode_idx, feature_idx = barcode_idx[kept], feature_idx[kept]

        barcodes = _read_node("/barcodes")
        gene_ids = _read_node("/features/id")
        gene_names = _read_node("/features/name")

        # The first entry of the genomes node names the matrix.
        genome = _read_node("/barcode_info/genomes")[0].decode()

        row_ind, col_ind, values = generate_sparse_matrix(
            barcode_idx, feature_idx)
        mat = csr_matrix((values, (row_ind, col_ind)),
                         shape=(barcodes.size, gene_ids.size))

        barcode_meta = {"barcodekey": barcodes}
        feature_meta = {"featurekey": gene_ids, "featurename": gene_names}

        mem_data = MemData()
        mem_data.addData(genome, Array2D(barcode_meta, feature_meta, mat))

        write_output(mem_data, output_name)

        print("Subsampled raw matrix is generated!")
Exemplo n.º 4
0
def merge_rna_and_adt_data(input_raw_h5, input_csv, antibody_control_csv,
                           output_name):
    """Merge an RNA count matrix and an ADT (antibody) count matrix into one output.

    The ADT matrix is stored under the key ``"CITE_Seq_" + <first key of the
    RNA data>`` and is log-transformed before being added:

    * without a control file, every entry becomes ``log1p(count)``;
    * with a control file, each listed antibody column becomes
      ``max(log(antibody + 1) - log(control + 1), 0)`` and only the listed
      antibodies are kept.

    Parameters
    ----------
    input_raw_h5
        RNA input, loaded with ``read_input(..., return_type="MemData")``.
    input_csv
        ADT input, loaded with ``read_input`` using the CITE-Seq key as genome.
    antibody_control_csv
        ``None``, or a CSV with a header row whose first column is the antibody
        name and whose second column is the matching control name. Both names
        must be present in the ADT feature index.
    output_name
        Destination passed to ``write_output``.
    """
    data = read_input(input_raw_h5, return_type="MemData")
    print("Loaded the RNA matrix.")

    keyword = "CITE_Seq_" + data.listKeys()[0]
    data_citeseq = read_input(input_csv, return_type="MemData", genome=keyword)
    print("Loaded the ADT matrix.")

    array2d = data_citeseq.getData(keyword)
    if antibody_control_csv is None:
        array2d.matrix = array2d.matrix.log1p()
    else:
        size = array2d.feature_metadata.shape[0]
        idx = np.zeros(size, dtype=bool)
        # Maps a feature name to its column position in the ADT matrix.
        antibody_to_pos = pd.Series(data=range(size),
                                    index=array2d.feature_metadata.index)

        adt_matrix = array2d.matrix.toarray().astype(float)

        # `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the
        # parsed frame along the column axis yields the same Series.
        series = pd.read_csv(antibody_control_csv,
                             header=0,
                             index_col=0).squeeze("columns")
        # `Series.iteritems()` was removed in pandas 2.0; `items()` is the
        # long-standing equivalent.
        for antibody, control in series.items():
            pos_a = antibody_to_pos[antibody]
            pos_c = antibody_to_pos[control]
            idx[pos_a] = True
            # convert to log expression relative to the control, floored at 0
            adt_matrix[:, pos_a] = np.maximum(
                np.log(adt_matrix[:, pos_a] + 1.0) -
                np.log(adt_matrix[:, pos_c] + 1.0),
                0.0,
            )

        # Keep only the antibodies that were listed in the control file.
        array2d.feature_metadata = array2d.feature_metadata[idx]
        array2d.matrix = csr_matrix(adt_matrix[:, idx])

    data.addData(keyword, array2d)
    write_output(data, output_name)

    print("Merged output is written.")
Exemplo n.º 5
0
def aggregate_matrices(
    csv_file: str,
    what_to_return: str = 'AnnData',
    restrictions: List[str] = None,
    attributes: List[str] = None,
    google_cloud: bool = False,
    select_singlets: bool = False,
    ngene: int = None,
    concat_matrices: bool = False,
) -> "None or AnnData or MemData":
    """Aggregate channel-specific count matrices into one big count matrix.

    This function takes as input a csv_file, which contains at least 2 columns — Sample, sample name; Location, file that contains the count matrices (e.g. filtered_gene_bc_matrices_h5.h5), and merges matrices from the same genome together. Depending on what_to_return, it can output the merged results into a pegasus-formatted HDF5 file or return as an AnnData or MemData object.

    Parameters
    ----------

    csv_file : `str`
        The CSV file containing information about each channel.
    what_to_return : `str`, optional (default: 'AnnData')
        If this value is equal to 'AnnData' or 'MemData', an AnnData or MemData object will be returned. Otherwise, results will be written into 'what_to_return.h5sc' file and None is returned.
    restrictions : `list[str]`, optional (default: None)
        A list of restrictions used to select channels, each restriction takes the format of name:value,…,value or name:~value,..,value, where ~ refers to not. ``None`` means no restrictions.
    attributes : `list[str]`, optional (default: None)
        A list of attributes need to be incorporated into the output count matrix. ``None`` means no attributes.
    google_cloud : `bool`, optional (default: False)
        If the channel-specific count matrices are stored in a google bucket.
    select_singlets : `bool`, optional (default: False)
        If we have demultiplexed data, turning on this option will make pegasus only include barcodes that are predicted as singlets.
    ngene : `int`, optional (default: None)
        The minimum number of expressed genes to keep one barcode.
    concat_matrices : `bool`, optional (default: False)
        If concatenate multiple matrices. If so, return only one AnnData object, otherwise, might return a list of AnnData objects.

    Returns
    -------
    `None` or `AnnData` or `MemData`
        Either `None` or an `AnnData` object or a `MemData` object.

    Raises
    ------
    ValueError
        If no channel passes the restrictions.
    AssertionError
        If a restriction names a column that is not in the CSV, or a channel whose format needs a genome has no ``Reference`` column.

    Examples
    --------
    >>> pg.aggregate_matrices('example.csv', 'example_10x.h5', ['Source:pbmc', 'Donor:1'], ['Source', 'Platform', 'Donor'])
    """
    # Fresh lists per call: a shared mutable default could leak state between
    # calls if a callee ever mutated it.
    restrictions = [] if restrictions is None else restrictions
    attributes = [] if attributes is None else attributes

    df = pd.read_csv(csv_file, header=0, index_col="Sample")
    df["Sample"] = df.index

    # Select channels
    rvec = [parse_restriction_string(x) for x in restrictions]

    idx = pd.Series([True] * df.shape[0], index=df.index, name="selected")
    for name, isin, content in rvec:
        assert name in df.columns, \
            "Restriction column '{}' is not in the CSV file".format(name)
        if isin:
            idx = idx & df[name].isin(content)
        else:
            idx = idx & (~(df[name].isin(content)))

    df = df.loc[idx]

    if df.shape[0] == 0:
        raise ValueError("No channels pass the restrictions!")

    # Load channels
    tot = 0
    aggrData = MemData()
    dest_paths = []
    try:
        for sample_name, row in df.iterrows():
            input_file = os.path.expanduser(
                os.path.expandvars(row["Location"].rstrip(os.sep)))
            file_format, copy_path, copy_type = infer_file_format(input_file)
            if google_cloud:
                base_name = os.path.basename(copy_path)
                dest_path = sample_name + "_tmp_" + base_name
                # Register before copying so a failed or partial copy is
                # still cleaned up in the `finally` clause below.
                dest_paths.append(dest_path)

                if copy_type == "directory":
                    check_call(["mkdir", "-p", dest_path])
                    call_args = [
                        "gsutil", "-m", "cp", "-r", copy_path, dest_path
                    ]
                else:
                    call_args = ["gsutil", "-m", "cp", copy_path, dest_path]
                check_call(call_args)

                input_file = dest_path
                if file_format == "csv" and copy_type == "directory":
                    # NOTE(review): input_file was just rebound to dest_path,
                    # so this joins dest_path with its own basename rather
                    # than with the original file's basename — confirm this
                    # matches the layout gsutil produces.
                    input_file = os.path.join(dest_path,
                                              os.path.basename(input_file))

            genome = None
            if file_format in ["dge", "csv", "mtx", "loom"]:
                # These formats get their genome name from the CSV row.
                assert "Reference" in row, \
                    "A 'Reference' column is required for {} input".format(
                        file_format)
                genome = row["Reference"]

            data = read_input(
                input_file,
                genome=genome,
                return_type="MemData",
                ngene=ngene,
                select_singlets=select_singlets,
            )
            data.update_barcode_metadata_info(sample_name, row, attributes)
            aggrData.addAggrData(data)

            tot += 1
            print("Processed {}.".format(input_file))
    finally:
        # Delete temporary files, also when loading a channel failed
        for dest_path in dest_paths:
            check_call(["rm", "-rf", dest_path])

    # Merge channels
    t1 = time.time()
    aggrData.aggregate()
    t2 = time.time()
    print("Data aggregation is finished in {:.2f}s.".format(t2 - t1))

    if what_to_return == "AnnData":
        aggrData = aggrData.convert_to_anndata(concat_matrices)
    elif what_to_return != "MemData":
        # Any other value is treated as an output file name.
        write_output(aggrData, what_to_return)
        aggrData = None

    print("Aggregated {tot} files.".format(tot=tot))

    return aggrData
Exemplo n.º 6
0
def run_pipeline(input_file, output_name, **kwargs):
    """Run the analysis pipeline on one input file and write the results.

    Stages, in order: load the input; for raw (or subclustered) data run
    filtering, log-normalization, highly-variable-feature selection, optional
    batch correction, PCA, neighbor search and optional diffusion map; then
    optional kBET, clustering, visualization embeddings, pseudotime, optional
    CITE-Seq merge, and finally output.

    Parameters
    ----------
    input_file
        Path passed to ``io.read_input``.
    output_name
        Prefix for every output: ``<output_name>.h5ad`` is always written;
        ``.seurat.h5ad`` and ``.loom`` are written when ``seurat_compatible``
        / ``output_loom`` are set. It is also forwarded to the FLE routines.
    **kwargs
        Pipeline options. ``seurat_compatible`` defaults to ``False`` and
        ``kBET`` may be absent; every other key is read with direct indexing,
        so a missing key that a selected stage needs raises ``KeyError``.

    Raises
    ------
    AssertionError
        If ``seurat_compatible`` is set on processed input or without
        ``select_hvf``, or if CITE-Seq input does not consist of exactly one
        RNA and one ``CITE_Seq*`` matrix.
    ValueError
        If ``seurat_compatible`` is combined with ``subcluster``.
    """
    is_raw = not kwargs["processed"]

    if "seurat_compatible" not in kwargs:
        kwargs["seurat_compatible"] = False

    # load input data
    adata = io.read_input(
        input_file,
        genome=kwargs["genome"],
        concat_matrices=not kwargs["cite_seq"],
        h5ad_mode=("a" if (is_raw or kwargs["subcluster"]) else "r+"),
        select_singlets=kwargs["select_singlets"],
        channel_attr=kwargs["channel_attr"],
        black_list=(
            kwargs["black_list"].split(",") if kwargs["black_list"] is not None else []
        ),
    )

    if not kwargs["cite_seq"]:
        if is_raw:
            values = adata.X.getnnz(axis=1)
            if values.min() == 0:  # 10x raw data
                adata._inplace_subset_obs(values >= kwargs["min_genes_on_raw"])
    else:
        # With cite_seq the reader returns the matrices unconcatenated:
        # tell the RNA matrix and the CITE-Seq matrix apart by genome name.
        data_list = adata
        assert len(data_list) == 2
        adata = cdata = None
        for candidate in data_list:
            if candidate.uns["genome"].startswith("CITE_Seq"):
                cdata = candidate
            else:
                adata = candidate
        assert adata is not None and cdata is not None
    print("Inputs are loaded.")

    if kwargs["seurat_compatible"]:
        assert is_raw and kwargs["select_hvf"]
        # The raw-count copy needed for the Seurat output is only taken in
        # the filtering step, which subclustering skips. Fail up front rather
        # than with a NameError after the whole pipeline has run.
        if kwargs["subcluster"]:
            raise ValueError(
                "seurat_compatible cannot be combined with subcluster: "
                "no raw count matrix is captured when subclustering."
            )

    if kwargs["subcluster"]:
        adata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
        is_raw = True  # get submat and then set is_raw to True

    if is_raw:
        if not kwargs["subcluster"]:
            # filter out low quality cells/genes
            tools.run_filter_data(
                adata,
                output_filt=kwargs["output_filt"],
                plot_filt=kwargs["plot_filt"],
                plot_filt_figsize=kwargs["plot_filt_figsize"],
                mito_prefix=kwargs["mito_prefix"],
                min_genes=kwargs["min_genes"],
                max_genes=kwargs["max_genes"],
                min_umis=kwargs["min_umis"],
                max_umis=kwargs["max_umis"],
                percent_mito=kwargs["percent_mito"],
                percent_cells=kwargs["percent_cells"],
            )

            if kwargs["seurat_compatible"]:
                raw_data = adata.copy()  # raw as count

            # normalize counts and then transform to log space
            tools.log_norm(adata, kwargs["norm_count"])

            # set group attribute
            if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
                tools.set_group_attribute(adata, kwargs["group_attribute"])

        # select highly variable features
        if kwargs["select_hvf"]:
            tools.highly_variable_features(
                adata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "pegasus":
                if kwargs["plot_hvf"] is not None:
                    from pegasus.plotting import plot_hvf

                    robust_idx = adata.var["robust"].values
                    plot_hvf(
                        adata.var.loc[robust_idx, "mean"],
                        adata.var.loc[robust_idx, "var"],
                        adata.var.loc[robust_idx, "hvf_loess"],
                        adata.var.loc[robust_idx, "highly_variable_features"],
                        kwargs["plot_hvf"] + ".hvf.pdf",
                    )

        # batch correction
        if kwargs["batch_correction"]:
            tools.correct_batch(adata, features="highly_variable_features")

        # PCA
        tools.pca(
            adata,
            n_components=kwargs["nPC"],
            features="highly_variable_features",
            random_state=kwargs["random_state"],
        )

        # Find K neighbors
        tools.neighbors(
            adata,
            K=kwargs["K"],
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

        # calculate diffmap; requesting either FLE variant forces it on
        if (
            kwargs["fle"]
            or kwargs["net_fle"]
        ):
            if not kwargs["diffmap"]:
                print("Turn on --diffmap option!")
            kwargs["diffmap"] = True

        if kwargs["diffmap"]:
            tools.diffmap(
                adata,
                n_components=kwargs["diffmap_ndc"],
                rep="pca",
                solver=kwargs["diffmap_solver"],
                random_state=kwargs["random_state"],
                max_t=kwargs["diffmap_maxt"],
            )
            if kwargs["diffmap_to_3d"]:
                tools.reduce_diffmap_to_3d(adata, random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            adata,
            kwargs["kBET_batch"],
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
        )
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format(
                stat_mean, pvalue_mean, accept_rate
            )
        )

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            adata,
            algo="spectral_louvain",
            rep="pca",
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            adata,
            algo="spectral_leiden",
            rep="pca",
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            adata,
            algo="louvain",
            rep="pca",
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            adata,
            algo="leiden",
            rep="pca",
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            adata,
            rep="pca",
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(adata, kwargs["pseudotime"])

    # merge cite-seq data and run t-SNE
    if kwargs["cite_seq"]:
        # Cells without ADT measurements keep an all-zero row.
        adt_matrix = np.zeros((adata.shape[0], cdata.shape[1]), dtype="float32")
        idx = adata.obs_names.isin(cdata.obs_names)
        adt_matrix[idx, :] = cdata[adata.obs_names[idx],].X.toarray()
        # Capping is skipped when the capping value is (numerically) 100.
        if abs(100.0 - kwargs["cite_seq_capping"]) > 1e-4:
            cite_seq.capping(adt_matrix, kwargs["cite_seq_capping"])

        # Antibody features are appended after the genes with an "AD-" prefix.
        var_names = np.concatenate(
            [adata.var_names, ["AD-" + x for x in cdata.var_names]]
        )

        new_data = anndata.AnnData(
            X=hstack([adata.X, csr_matrix(adt_matrix)], format="csr"),
            obs=adata.obs,
            obsm=adata.obsm,
            uns=adata.uns,
            var={
                "var_names": var_names,
                "gene_ids": var_names,
                "n_cells": np.concatenate(
                    [adata.var["n_cells"].values, [0] * cdata.shape[1]]
                ),
                "percent_cells": np.concatenate(
                    [adata.var["percent_cells"].values, [0.0] * cdata.shape[1]]
                ),
                "robust": np.concatenate(
                    [adata.var["robust"].values, [False] * cdata.shape[1]]
                ),
                "highly_variable_features": np.concatenate(
                    [
                        adata.var["highly_variable_features"].values,
                        [False] * cdata.shape[1],
                    ]
                ),
            },
        )
        new_data.obsm["X_CITE-Seq"] = adt_matrix
        adata = new_data
        print("ADT count matrix is attached.")

        tools.fitsne(
            adata,
            rep="CITE-Seq",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            out_basis="citeseq_fitsne",
        )
        print("Antibody embedding is done.")

    if kwargs["seurat_compatible"]:
        seurat_data = adata.copy()
        seurat_data.raw = raw_data
        seurat_data.uns["scale.data"] = adata.uns["fmat_highly_variable_features"] # assign by reference
        seurat_data.uns["scale.data.rownames"] = adata.var_names[
            adata.var["highly_variable_features"]
        ].values
        io.write_output(seurat_data, output_name + ".seurat.h5ad")

    # write out results
    io.write_output(adata, output_name + ".h5ad")

    if kwargs["output_loom"]:
        io.write_output(adata, output_name + ".loom")

    print("Results are written.")
Exemplo n.º 7
0
 def execute(self):
     """Load the input named by ``<input.h5sc>`` as MemData and write it to ``<output_10x.h5>``."""
     source_path = self.args["<input.h5sc>"]
     mem_data = read_input(source_path, return_type="MemData")
     # The output argument is looked up only after the input has been read.
     target_path = self.args["<output_10x.h5>"]
     write_output(mem_data, target_path)
Exemplo n.º 8
0
def run_demuxEM_pipeline(input_adt_file, input_rna_file, output_name, **kwargs):
    """Run demuxEM demultiplexing on an ADT/RNA file pair and write all outputs.

    Steps, in order: load both inputs, filter RNA barcodes by gene and UMI
    counts, estimate background probabilities, demultiplex, copy the
    demultiplexing columns onto the raw per-genome matrices, optionally
    generate plots, write the outputs and print summary statistics.

    Parameters
    ----------
    input_adt_file
        Hashtag/ADT input, read with ``genome="_ADT_"``.
    input_rna_file
        RNA input. It is read twice: once concatenated for demultiplexing and
        once as unconcatenated ``MemData`` to be annotated with the results.
    output_name
        Prefix for outputs: ``<output_name>_ADTs.h5ad``,
        ``<output_name>_demux.h5ad``, ``<output_name>_demux`` (reported as
        ``.h5sc``) and the optional ``.pdf`` plots.
    **kwargs
        Must provide ``genome``, ``min_num_genes``, ``min_num_umis``,
        ``random_state``, ``min_signal``, ``alpha``, ``n_jobs``,
        ``gen_plots`` and ``gen_gender_plot`` (a sized collection of gene
        names; empty disables the violin plots).
    """
    # load input data
    adt = io.read_input(input_adt_file, genome="_ADT_")
    print("ADT file is loaded.")
    data = io.read_input(input_rna_file, genome=kwargs["genome"], concat_matrices=True)
    print("RNA file is loaded.")

    # Filter the RNA matrix
    data.obs["n_genes"] = data.X.getnnz(axis=1)
    data.obs["n_counts"] = data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce(
        (
            data.obs["n_genes"] >= kwargs["min_num_genes"],
            data.obs["n_counts"] >= kwargs["min_num_umis"],
        )
    )
    data._inplace_subset_obs(obs_index)
    data.var["robust"] = True

    # run demuxEM
    demuxEM.estimate_background_probs(adt, random_state=kwargs["random_state"])
    print("Background probability distribution is estimated.")
    demuxEM.demultiplex(
        data,
        adt,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )
    print("Demultiplexing is done.")

    # annotate raw matrix with demuxEM results
    genome_indexed_raw_data = io.read_input(
        input_rna_file, return_type="MemData", concat_matrices=False
    )

    # "assignment.dedup" is only copied when demultiplexing produced it.
    demux_columns = ["demux_type", "assignment"]
    if "assignment.dedup" in data.obs:
        demux_columns.append("assignment.dedup")

    for keyword in genome_indexed_raw_data.listKeys():
        array2d = genome_indexed_raw_data.getData(keyword)
        barcodes = array2d.barcode_metadata.index
        idx = barcodes.isin(data.obs_names)
        selected = barcodes[idx]

        for column in demux_columns:
            # Barcodes that were filtered out above get an empty string.
            column_values = np.empty(barcodes.size, dtype="object")
            column_values[:] = ""
            column_values[idx] = data.obs.loc[selected, column]
            array2d.barcode_metadata[column] = column_values

    print("Demultiplexing results are added to raw expression matrices.")

    # generate plots
    if kwargs["gen_plots"]:
        demuxEM.plot_adt_hist(
            adt, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0
        )
        demuxEM.plot_bar(
            adt.uns["background_probs"],
            adt.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        demuxEM.plot_adt_hist(
            adt, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5
        )
        demuxEM.plot_rna_hist(data, output_name + ".rna_demux.hist.pdf")
        print("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        # Note: this log-normalizes `data` in place before it is written below.
        tools.log_norm(data, 1e5)
        for gene_name in kwargs["gen_gender_plot"]:
            demuxEM.plot_violin(
                data,
                {"gene": gene_name},
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name
                ),
                title="{gene_name}: a gender-specific gene".format(gene_name=gene_name),
            )
        print("Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(adt, output_name + "_ADTs.h5ad")
    print(
        "Hashtag count information is written to {output_name}_ADTs.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(data, output_name + "_demux.h5ad")
    print(
        "Demutiplexed RNA expression information is written to {output_name}_demux.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(genome_indexed_raw_data, output_name + "_demux")
    print(
        "Raw pegasus-format hdf5 file with demultiplexing results is written to {output_name}_demux.h5sc .".format(
            output_name=output_name
        )
    )

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(data.shape[0]))
    # `Series.iteritems()` was removed in pandas 2.0; `items()` is equivalent.
    for name, value in data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))