Exemplo n.º 1
0
def filter_intra_doublets(molecule_table: pd.DataFrame,
                          prop: float = 0.1) -> pd.DataFrame:
    """Drops cellBCs whose UMIs disagree too often about an intBC's allele.

    Within every cellBC, the allele backed by the most UMIs is taken as the
    consensus allele of each intBC; every other UMI of that cellBC-intBC pair
    counts as conflicting. A cellBC is kept only if its share of conflicting
    UMIs, taken over all of its UMIs, is <= prop.

    Args:
        molecule_table: A molecule table of cellBC-UMI pairs to be filtered
        prop: The largest tolerated proportion of conflicting UMIs; cellBCs
            strictly above it are removed from the DataFrame

    Returns:
        The rows of the molecule table that belong to passing cellBCs
    """
    allele_keys = ["cellBC", "intBC", "allele"]

    # One row per (cellBC, intBC, allele) with its number of UMI rows,
    # ordered from the best supported allele to the least supported one.
    allele_counts = molecule_table.groupby(allele_keys)["UMI"].size()
    allele_counts = allele_counts.reset_index()
    allele_counts = allele_counts.sort_values("UMI", ascending=False)

    # Thanks to the descending sort, the first row seen for a cellBC-intBC
    # pair is its dominant allele.
    dominant = allele_counts.drop_duplicates(["cellBC", "intBC"])

    total_per_cell = allele_counts.groupby("cellBC")["UMI"].sum()
    dominant_per_cell = dominant.groupby("cellBC")["UMI"].sum()
    conflict_fraction = (total_per_cell - dominant_per_cell) / total_per_cell

    keep = conflict_fraction <= prop
    kept_cells = set(conflict_fraction.index[keep])
    n_dropped = (~keep).sum()
    logger.debug(
        f"Filtered {n_dropped} cellBCs with too much conflicitng "
        "allele information.")
    return molecule_table[molecule_table["cellBC"].isin(kept_cells)]
Exemplo n.º 2
0
 def wrapper(*args, **kwargs):
     # Run the wrapped callable, then log summary statistics of the
     # DataFrame it produced before handing that DataFrame back.
     result = wrapped(*args, **kwargs)

     # A non-object "UMI" column is treated as holding per-row UMI counts
     # (allele table); otherwise each row is counted as one UMI.
     has_numeric_umis = result["UMI"].dtype != object
     table_kind = "alleletable" if has_numeric_umis else "molecule_table"
     logger.debug(f"Resulting {table_kind} statistics:")

     total_reads = result["readCount"].sum()
     logger.debug(f"# Reads: {total_reads}")

     total_umis = result["UMI"].sum() if has_numeric_umis else result.shape[0]
     logger.debug(f"# UMIs: {total_umis}")

     n_cell_barcodes = result["cellBC"].nunique()
     logger.debug(f"# Cell BCs: {n_cell_barcodes}")
     return result
Exemplo n.º 3
0
def map_intbcs(molecule_table: pd.DataFrame) -> pd.DataFrame:
    """Assign one allele to each intBC/cellBC pair.

    For each intBC/cellBC pairing, selects the allele supported by the most
    UMIs (ties are broken by the summed read count) and removes alignments
    that don't have that allele. Rows whose intBC is NaN are dropped first.

    Args:
        molecule_table: A molecule table of cellBC-UMI pairs to be filtered

    Returns:
        The rows of the molecule table that carry the allele selected for
        their cellBC-intBC pair
    """
    keys = ["cellBC", "intBC", "allele"]

    # Have to drop out all intBCs that are NaN
    molecule_table = molecule_table.dropna(subset=["intBC"])

    # Rank the alleles of every cellBC-intBC pair: UMI count first, summed
    # readCount second, both descending.
    # NOTE(review): confirm that UMI count (not readCount) is meant to be the
    # primary criterion.
    allele_table = (
        molecule_table.groupby(keys)
        .agg({"readCount": "sum", "UMI": "count"})
        .reset_index()
        .sort_values(["UMI", "readCount"], ascending=False)
    )

    # After sorting, every row but the first of a cellBC-intBC pair is a
    # lower ranked allele that gets discarded.
    duplicated_mask = allele_table.duplicated(["cellBC", "intBC"])
    mapped_alleles = allele_table.loc[~duplicated_mask, keys]

    # True for rows that contain the mapped allele; False for ones to filter
    # out. A left merge keeps the row order of the molecule table, and since
    # mapped_alleles is unique on the keys it yields exactly one output row
    # per input row; this avoids building a Python tuple for every row.
    membership = molecule_table[keys].merge(
        mapped_alleles, on=keys, how="left", indicator=True
    )
    selection_mask = (membership["_merge"] == "both").to_numpy()

    mapped_table = molecule_table[selection_mask]
    logger.debug(f"Alleles removed: {duplicated_mask.sum()}")
    logger.debug(f"UMIs removed: {(~selection_mask).sum()}")
    return mapped_table
Exemplo n.º 4
0
def filter_inter_doublets(at: pd.DataFrame,
                          rule: float = 0.35) -> pd.DataFrame:
    """Filters out cells whose kinship with their assigned lineage is low.

    Essentially, filters out cells that have ambiguous kinship across multiple
    lineage groups. For every cell, calculates the kinship it has with its
    assigned lineage, with kinship defined as the weighted proportion of intBCs
    it shares with the intBC set for a lineage (see compute_lg_membership for
    more details on the weighting). If that kinship is < rule, then the cell is
    filtered out.

    Args:
        at: An allele table of cellBC-intBC-allele groups to be filtered
        rule: The minimum kinship threshold which a cell needs to reach in
            order to be included in the final DataFrame

    Returns:
        A filtered allele table
    """
    ibc_sets = {}
    dropouts = {}
    # Group by the bare column label rather than a one-element list: with a
    # list, pandas >= 2.0 yields 1-tuples such as (3,) as group keys, which
    # would no longer match the integer lineage lookup below.
    for lg_name, at_lg in at.groupby("lineageGrp"):
        ibc_sets[lg_name], dropouts[lg_name] = get_intbc_set(at_lg)

    # Calculate kinship for each lineage group for each cell
    n_filtered = 0
    passing_cellBCs = []
    for cellBC, at_cellBC in at.groupby("cellBC"):
        # The lineage of a cell is read from its first row
        lg = int(at_cellBC["lineageGrp"].iloc[0])
        mem = compute_lg_membership(at_cellBC, ibc_sets, dropouts)
        if mem[lg] < rule:
            n_filtered += 1
        else:
            passing_cellBCs.append(cellBC)

    n_cells = at["cellBC"].nunique()
    logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells")
    return at[at["cellBC"].isin(passing_cellBCs)]
Exemplo n.º 5
0
 def wrapper(*args, **kwargs):
     # Record the keyword arguments of this call at debug level, then
     # delegate to the wrapped callable and pass its result through untouched.
     logger.debug(f"Keyword arguments: {kwargs}")
     outcome = wrapped(*args, **kwargs)
     return outcome
Exemplo n.º 6
0
def call_lineage_groups(
    input_df: pd.DataFrame,
    output_directory: str,
    min_umi_per_cell: int = 10,
    min_avg_reads_per_umi: float = 2.0,
    min_cluster_prop: float = 0.005,
    min_intbc_thresh: float = 0.05,
    inter_doublet_threshold: float = 0.35,
    kinship_thresh: float = 0.25,
    plot: bool = False,
) -> pd.DataFrame:
    """Assigns cells to their clonal populations.

    Performs multiple rounds of filtering and assigning to lineage groups:
        1. Iteratively generates putative lineage groups by forming intBC
        groups for each lineage group and then assigning cells based on how
        many intBCs they share with each intBC group (kinship).

        2. Refines these putative groups by removing non-informative intBCs
        and reassigning cells through kinship.

        3. Removes all inter-lineage doublets, defined as cells that have
        relatively equal kinship scores across multiple lineages and whose
        assignments are therefore ambiguous.

        4. Finally, performs one more round of filtering non-informative intBCs
        and cellBCs with low UMI counts before returning a final table of
        lineage assignments, allele information, and read and umi counts for
        each sample.

    Args:
        input_df: The allele table of cellBC-UMI-allele groups to be annotated
            with lineage assignments
        output_directory: The folder handed to the plotting helpers; it is
            only used when `plot` is True
        min_umi_per_cell: The threshold specifying the minimum number of UMIs a
            cell needs in order to not be filtered during filtering
        min_avg_reads_per_umi: The threshold specifying the minimum coverage
            (i.e. average) reads per UMI in a cell needed in order for that
            cell not to be filtered during filtering
        min_cluster_prop: The minimum cluster size in the putative lineage
            assignment step, as a proportion of the number of cells
        min_intbc_thresh: The threshold specifying the minimum proportion of
            cells in a lineage group that need to have an intBC in order for it
            be retained during filtering. Also specifies the minimum proportion
            of cells that share an intBC with the most frequent intBC in
            forming putative lineage groups
        inter_doublet_threshold: The threshold specifying the minimum proportion
            of kinship a cell shares with its assigned lineage group out of all
            lineage groups for it to be retained during doublet filtering. A
            falsy value (e.g. 0 or None) skips inter-doublet filtering
        kinship_thresh: The threshold specifying the minimum proportion of
            intBCs shared between a cell and the intBC set of a lineage group
            needed to assign that cell to that lineage group in putative
            assignment
        plot: Indicates whether to generate plots

    Returns:
        The filtered allele table annotated with an integer "lineageGrp"
        column.
    """
    logger.info(
        f"{input_df.shape[0]} UMIs (rows), with {input_df.shape[1]} attributes (columns)"
    )
    logger.info(f"{len(input_df['cellBC'].unique())} Cells")

    # Create a pivot_table of UMI counts per cellBC (rows) and intBC (columns),
    # then normalize every row so that it sums to 1
    piv = pd.pivot_table(input_df,
                         index="cellBC",
                         columns="intBC",
                         values="UMI",
                         aggfunc="count")
    piv = piv.div(piv.sum(axis=1), axis=0)

    # Reorder piv columns by binarized intBC frequency, i.e. by the number of
    # cells in which each intBC is observed
    pivbin = piv.copy()
    pivbin[pivbin > 0] = 1
    intBC_sums = pivbin.sum(0)
    ordered_intBCs = intBC_sums.sort_values(ascending=False).index.tolist()
    piv = piv[ordered_intBCs]
    min_clust_size = int(min_cluster_prop * piv.shape[0])

    logger.info("Assigning initial lineage groups...")
    logger.info(f"Clustering with minimum cluster size {min_clust_size}...")
    piv_assigned = lineage_utils.assign_lineage_groups(
        piv,
        min_clust_size,
        min_intbc_thresh=min_intbc_thresh,
        kinship_thresh=kinship_thresh,
    )

    logger.info("Refining lineage groups...")
    logger.info(
        "Redefining lineage groups by removing low proportion intBCs...")
    master_LGs, master_intBCs = lineage_utils.filter_intbcs_lg_sets(
        piv_assigned, min_intbc_thresh=min_intbc_thresh)

    logger.info("Reassigning cells to refined lineage groups by kinship...")
    kinship_scores = lineage_utils.score_lineage_kinships(
        piv_assigned, master_LGs, master_intBCs)

    logger.info("Annotating alignment table with refined lineage groups...")
    allele_table = lineage_utils.annotate_lineage_groups(
        input_df, kinship_scores, master_intBCs)
    if inter_doublet_threshold:
        logger.info(
            f"Filtering out inter-lineage group doublets with proportion {inter_doublet_threshold}..."
        )
        allele_table = doublet_utils.filter_inter_doublets(
            allele_table, rule=inter_doublet_threshold)

    logger.info(
        "Filtering out low proportion intBCs in finalized lineage groups...")
    filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
        allele_table, min_intbc_thresh=min_intbc_thresh)

    allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
        filtered_lgs)

    logger.debug("Final lineage group assignments:")
    # Group by the bare column label: grouping by a one-element list makes
    # pandas >= 2.0 yield 1-tuples, which would be logged as "LG (1,)".
    for n, g in allele_table.groupby("lineageGrp"):
        logger.debug(f"LG {n}: {len(g['cellBC'].unique())} cells")

    logger.info("Filtering out low UMI cell barcodes...")
    allele_table = utilities.filter_cells(
        allele_table,
        min_umi_per_cell=int(min_umi_per_cell),
        min_avg_reads_per_umi=min_avg_reads_per_umi,
    )
    allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(int)

    if plot:
        logger.info("Producing Plots...")
        # Binary presence/absence matrix of intBCs per cellBC for the heatmaps
        at_pivot_I = pd.pivot_table(
            allele_table,
            index="cellBC",
            columns="intBC",
            values="UMI",
            aggfunc="count",
        )
        at_pivot_I.fillna(value=0, inplace=True)
        at_pivot_I[at_pivot_I > 0] = 1

        logger.info("Producing pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap(allele_table, at_pivot_I,
                                           output_directory)

        logger.info("Plotting filtered lineage group pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap_lg(allele_table, at_pivot_I,
                                              output_directory)

    return allele_table
Exemplo n.º 7
0
def find_top_lg(
    PIVOT_in: pd.DataFrame,
    iteration: int,
    min_intbc_prop: float = 0.2,
    kinship_thresh: float = 0.2,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Creates one lineage group from a cellBC-by-intBC pivot table.

    First, identifies the intBC with the largest column sum. Then, among the
    cells that have this top intBC, selects all intBCs present in a number of
    cells >= min_intbc_prop times the number of cells having the top intBC and
    defines that as the cluster set. Then finds all cells whose pivot values
    summed over the cluster set ("kinship") are >= kinship_thresh and includes
    them in the cluster. Finally outputs the cluster as the lineage group.

    Args:
        PIVOT_in: The input pivot table with one row per cellBC and one column
            per intBC. The kinship is the row sum over the cluster set, so it
            is a fraction of UMIs only if rows are normalized to proportions
            (presumably done by the caller; not checked here)
        iteration: The cluster number and iteration number of the iterative
            wrapper function; the group is labeled iteration + 1
        min_intbc_prop: In order for an intBC to be included in the cluster
            set, it must be present in at least this proportion of the cells
            that have the most frequent intBC
        kinship_thresh: The minimum kinship a cell needs with the cluster set
            in order to be included in that cluster

    Returns:
        A pivot table of the cells assigned to the lineage group, with an added
            "lineageGrp" column, and a pivot table of the remaining unassigned
            cells. The input table is not modified.
    """

    # Identify the top intBC, i.e. the one with the largest column sum
    intBC_sums = PIVOT_in.sum(0).sort_values(ascending=False)
    intBC_top = intBC_sums.index[0]

    # Take the cells that have the top intBC and binarize their entries.
    # `mask` returns a new frame, so nothing is assigned into an object that
    # was derived from PIVOT_in by indexing.
    top_cells = PIVOT_in[PIVOT_in[intBC_top] > 0]
    top_cells_binary = top_cells.mask(top_cells > 0, 1)

    # Define intBC set: intBCs seen in enough of the cells carrying the top
    # intBC
    cells_per_intBC = top_cells_binary.sum(0)
    total = cells_per_intBC[intBC_top]
    intBC_set = cells_per_intBC[cells_per_intBC >= min_intbc_prop *
                                total].index.tolist()

    # Reduce PIV to only intBCs considered in set
    PIV_set = PIVOT_in.loc[:, PIVOT_in.columns.isin(intBC_set)]

    # Calculate the kinship of each cell: its pivot values summed over the
    # intBC set
    f_inset = PIV_set.sum(axis=1)

    # Define set of cells with good kinship
    LG_cells = f_inset[f_inset >= kinship_thresh].index.tolist()
    in_group = PIVOT_in.index.isin(LG_cells)

    # Remaining cells, to be considered in later iterations
    PIV_noLG = PIVOT_in.loc[~in_group, :]

    # Assigned cells, labeled with their (1-based) lineage group number
    PIV_LG = PIVOT_in.loc[in_group, :].copy()
    PIV_LG["lineageGrp"] = iteration + 1

    logger.debug(
        f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned")

    return PIV_LG, PIV_noLG