示例#1
0
def make_st_dataset(cnt_pths : List[str],
                    topn_genes : int = None,
                    min_counts : int = 0,
                    min_spots : int = 0,
                    filter_genes : bool = False,
                    transpose : bool = False,
                    keep_barcodes: bool = True,
                    )-> CountData :

    """
    Generate CountData object for ST-data

    Parameter:
    ---------

    cnt_pths : List[str]
        list of paths to ST-data. The file
        extension of the first path decides
        which reader is used for all paths.

    topn_genes : int
        number of top expressed genes (by
        total count over all rows) to include
        in analysis. If None, all genes are
        kept. Values exceeding the number of
        genes are capped to that number.

    min_counts : int
        minimal number of observed
        counts assigned to a specific
        spot/cell for it to be included

    min_spots : int
        minimal number of occurrences
        of a gene among all spots/cells
        for it to be included in analysis.
        Forwarded to CountData.filter_bad
        as its min_occurance argument.

    filter_genes : bool
        if True, CountData.filter_genes is
        applied (described upstream as
        excluding MALAT1 and RP genes from
        analysis)

    transpose : bool
        transpose data. Only forwarded when
        the input is not h5ad; ignored for
        h5ad files.

    keep_barcodes: bool
        if original rownames should be
        used or whether rownames should
        be based on spatial coordinates.
        Only applicable when using h5ad files.


    Returns:
    -------

    CountData object for the ST data

    """

    # create joint matrix for count data;
    # the first path's extension selects the
    # reader used for the whole list
    st_ext = utils.get_extenstion(cnt_pths[0])

    if st_ext == "h5ad":
        cnt = utils.read_h5ad_st(cnt_pths,keep_barcodes)
    else:
        cnt = utils.make_joint_matrix(cnt_pths,
                                      transpose)

    # select top N genes if specified
    if topn_genes is not None:
        # total count per column (gene) across all rows
        genesize = cnt.values.sum(axis = 0)
        # never request more genes than are available
        n_keep = min(topn_genes,genesize.shape[0])
        # descending order of total expression
        sel = np.argsort(genesize)[::-1][0:n_keep]
        cnt = cnt.iloc[:,sel]

    dataset = CountData(cnt)

    # filter genes based on name
    if filter_genes:
        dataset.filter_genes()

    # filter data based on quality; skipped
    # entirely when both thresholds are disabled
    if min_counts > 0 or min_spots > 0:
        dataset.filter_bad(min_counts = min_counts,
                           min_occurance = min_spots,
                           )

    return dataset
示例#2
0
def make_sc_dataset(cnt_pth : str,
                    lbl_pth : str,
                    topn_genes : int = None,
                    gene_list_pth : str = None,
                    filter_genes : bool = False,
                    lbl_colname : str = 'bio_celltype',
                    min_counts : int = 300,
                    min_cells : int = 0,
                    transpose : bool = False,
                    upper_bound : int = None,
                    lower_bound : int = None,
                    ) -> "CountData":

    """
    Generate CountData object for SC-data

    Parameter:
    ---------

    cnt_pth : str
        path to SC count data. Its file
        extension decides which reader
        is used.

    lbl_pth : str
        path to SC label data

    topn_genes : int
        number of top expressed genes (by
        total count over all cells) to
        include. If None, all genes are kept.
        Values exceeding the number of genes
        are capped to that number.

    gene_list_pth : str
        path to a text file with one gene
        name per line. If given, only genes
        present both in the data and in
        this list are kept.

    filter_genes : bool
        if True, CountData.filter_genes is
        applied to the assembled data set

    lbl_colname : str
        name of column containing labels.
        For non-h5ad input, None selects
        the first column of the label file.

    min_counts : int
        minimal number of observed
        counts assigned to a specific
        spot/cell for it to be included

    min_cells : int
        minimal number of occurrences
        of a gene among all cells
        for it to be included. Forwarded
        to CountData.filter_bad as its
        min_occurance argument.

    transpose : bool
        transpose data. Only applied when
        the input is not h5ad.

    lower_bound : int
        lower bound for the number of cells to
        include from each type
    upper_bound : int
        upper bound for the number of cells to
        include from each type


    Returns:
    -------

    CountData object for the SC data

    Raises:
    ------

    SystemExit
        if count and label data share no
        index entries

    """

    sc_ext = utils.get_extenstion(cnt_pth)

    if sc_ext == 'h5ad' :
        cnt,lbl = utils.read_h5ad_sc(cnt_pth,
                                     lbl_colname,
                                     lbl_pth,
                                     )
    else:
        cnt = utils.read_file(cnt_pth,sc_ext)
        if transpose:
            cnt = cnt.T
        lbl = utils.read_file(lbl_pth)

        # get labels
        if lbl_colname is None:
            lbl = lbl.iloc[:,0]
        else:
            lbl = lbl.loc[:,lbl_colname]

    # match count and label data
    inter = cnt.index.intersection(lbl.index)
    if inter.shape[0] < 1:
        print("[ERROR] : single cell count and annotation"\
              " data did not match. Exiting.",
              file = sys.stderr,
              )
        # actually terminate, as the message states; continuing
        # would only build a data set from empty inputs
        sys.exit(1)

    cnt = cnt.loc[inter,:]
    lbl = lbl.loc[inter]

    # subsample cells per type when any bound is given
    if upper_bound is not None or\
       lower_bound is not None:
        cnt,lbl = utils.subsample_data(cnt,
                                       lbl,
                                       lower_bound,
                                       upper_bound,
                                       )

    # select top N expressed genes
    if topn_genes is not None:
        # total count per column (gene) across all cells
        genesize = cnt.values.sum(axis = 0)
        # never request more genes than are available
        n_keep = min(topn_genes,genesize.shape[0])
        # descending order of total expression
        sel = np.argsort(genesize)[::-1][0:n_keep]
        cnt = cnt.iloc[:,sel]

    # only use genes in specific genes list
    # if specified
    if gene_list_pth is not None:
        # the file is only read, so open it read-only; 'r+'
        # would needlessly fail on write-protected files
        with open(gene_list_pth,'r') as fopen:
            gene_list = pd.Index([ x.rstrip('\n') for x in fopen ])

        sel = cnt.columns.intersection(gene_list)
        cnt = cnt.loc[:,sel]

    # create sc data set
    dataset = CountData(cnt = cnt,
                        lbl = lbl)

    # filter genes based on names
    if filter_genes:
        dataset.filter_genes()

    # filter data based on quality; skipped
    # entirely when both thresholds are disabled
    if min_counts > 0 or min_cells > 0:
        dataset.filter_bad(min_counts = min_counts,
                           min_occurance = min_cells,
                          )

    return dataset