示例#1
0
def test_read_10x_csv():
    """Read the 10x ``filtered_contig_annotations.csv`` fixture and spot-check ``obs``.

    Asserts that ``obs`` has five rows, then verifies selected ``IR_*``
    columns of the rows at positions 1, 3 and 4.
    """
    adata = read_10x_vdj(TESTDATA / "10x/filtered_contig_annotations.csv")
    obs = adata.obs
    assert len(obs) == 5
    second_row, fourth_row, fifth_row = (obs.iloc[pos] for pos in (1, 3, 4))

    # Row at position 1: barcode plus exact column values.
    assert second_row.name == "AAACCTGAGTACGCCC-1"
    expected_second = {
        "IR_VDJ_1_junction_aa": "CASSLGPSTDTQYF",
        "IR_VDJ_1_junction": "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT",
        "IR_VDJ_1_duplicate_count": 55,
        "IR_VDJ_1_consensus_count": 18021,
        "IR_VDJ_1_v_call": "TRBV7-2",
        "IR_VDJ_1_d_call": "TRBD2",
        "IR_VDJ_1_j_call": "TRBJ2-3",
        "IR_VDJ_1_c_call": "TRBC2",
        "IR_VJ_1_locus": "TRA",
        "IR_VDJ_1_locus": "TRB",
    }
    for column, value in expected_second.items():
        assert second_row[column] == value, column
    assert _is_false(second_row["multi_chain"])

    # Row at position 3: two VJ entries are populated, the second VDJ slot is NA.
    assert fourth_row.name == "AAACCTGGTCCGTTAA-1"
    expected_fourth = {
        "IR_VJ_1_junction_aa": "CALNTGGFKTIF",
        "IR_VJ_2_junction_aa": "CAVILDARLMF",
        "IR_VJ_1_duplicate_count": 5,
        "IR_VJ_2_duplicate_count": 5,
        "IR_VJ_1_locus": "TRA",
        "IR_VDJ_1_locus": "TRB",
        "IR_VJ_2_locus": "TRA",
    }
    for column, value in expected_fourth.items():
        assert fourth_row[column] == value, column
    assert _is_na(fourth_row["IR_VDJ_2_junction_aa"])

    # Row at position 4: only the loci are checked.
    assert fifth_row.name == "AAACTTGGTCCGTTAA-1"
    expected_fifth = {"IR_VJ_1_locus": "IGK", "IR_VDJ_1_locus": "IGH"}
    for column, value in expected_fifth.items():
        assert fifth_row[column] == value, column
示例#2
0
def test_read_10x():
    """Read the 10x ``all_contig_annotations.json`` fixture and spot-check ``obs``.

    Asserts that one specific barcode is absent, that ``obs`` has two rows,
    and verifies selected ``TRA_*``/``TRB_*`` columns of both rows.
    """
    adata = read_10x_vdj("tests/data/10x/all_contig_annotations.json")
    obs = adata.obs
    # this has `is_cell=false` and should be filtered out
    assert "AAACCTGAGACCTTTG-1" not in adata.obs_names
    assert len(obs) == 2
    first_row, second_row = (obs.iloc[pos] for pos in (0, 1))

    assert first_row.name == "AAACCTGAGACCTTTG-2"
    expected_first = {
        "TRB_1_cdr3": "CASSPPSQGLSTGELFF",
        "TRB_1_cdr3_nt": "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT",
        "TRB_1_junction_ins": 4 + 7,
        "TRB_1_expr": 1,
        "TRB_1_v_gene": "TRBV18",
        "TRB_1_d_gene": "TRBD1",
        "TRB_1_j_gene": "TRBJ2-2",
        "TRB_1_c_gene": "TRBC2",
    }
    for column, value in expected_first.items():
        assert first_row[column] == value, column
    assert _is_false(first_row["multi_chain"])
    empty_in_first = ["TRA_1_cdr3", "TRB_2_cdr3", "TRA_1_junction_ins"]
    assert np.all(_is_na(first_row[empty_in_first]))

    assert second_row.name == "AAACCTGAGTACGCCC-1"
    expected_second = {
        "TRA_1_cdr3": "CAMRVGGSQGNLIF",
        "TRA_2_cdr3": "CATDAKDSNYQLIW",
        "TRA_1_expr": 9,
        "TRA_2_expr": 4,
        "TRA_1_junction_ins": 4,
        "TRA_2_junction_ins": 4,
    }
    for column, value in expected_second.items():
        assert second_row[column] == value, column
    empty_in_second = ["TRB_1_cdr3", "TRB_2_cdr3"]
    assert np.all(_is_na(second_row[empty_in_second]))
示例#3
0
def test_read_10x_example():
    """Test that a full 10x CSV table can be imported without errors.

    Besides not raising, the imported object must contain at least one
    observation, so an empty result does not pass unnoticed.

    Test-dataset from https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    under CC-BY-4.0
    """
    anndata = read_10x_vdj(
        "tests/data/10x/vdj_nextgem_hs_pbmc3_t_filtered_contig_annotations.csv.gz"
    )
    # Use the result instead of leaving it as an unused local: the same
    # non-empty check is applied in `test_read_and_convert_10x_example`.
    assert anndata.shape[0] > 0
示例#4
0
def test_read_10x_json_cr6():
    """Check the extra CR6 columns: fwr{1,2,3,4}{,_aa} and cdr{1,2,3}{,_aa}.

    Reads the small CR6 JSON fixture with ``include_fields=None``, expects two
    rows in ``obs`` and compares the region columns of the first row, for both
    the ``IR_VDJ_1`` and ``IR_VJ_1`` prefixes, against known sequences.
    """
    json_path = (
        TESTDATA
        / "10x/10k_BMMNC_5pv2_nextgem_Multiplex_vdj_t_all_contig_annotations_small.json"
    )
    adata = read_10x_vdj(json_path, include_fields=None)
    obs = adata.obs
    assert len(obs) == 2
    first_row = obs.iloc[0]

    assert first_row.name == "AAACCTGCACAGGTTT-1"

    # Column name -> expected sequence; `_aa` columns hold amino acids, the
    # un-suffixed columns hold the corresponding nucleotide sequence.
    expected = {
        "IR_VDJ_1_fwr1_aa": "KAGVTQTPRYLIKTRGQQVTLSCSPI",
        "IR_VDJ_1_fwr1": "AAGGCTGGAGTCACTCAAACTCCAAGATATCTGATCAAAACGAGAGGACAGCAAGTGACACTGAGCTGCTCCCCTATC",
        "IR_VDJ_1_cdr1_aa": "SGHRS",
        "IR_VDJ_1_cdr1": "TCTGGGCATAGGAGT",
        "IR_VDJ_1_fwr2_aa": "VSWYQQTPGQGLQFLFE",
        "IR_VDJ_1_fwr2": "GTATCCTGGTACCAACAGACCCCAGGACAGGGCCTTCAGTTCCTCTTTGAA",
        "IR_VDJ_1_cdr2_aa": "YFSETQ",
        "IR_VDJ_1_cdr2": "TACTTCAGTGAGACACAG",
        "IR_VDJ_1_fwr3_aa": "RNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYL",
        "IR_VDJ_1_fwr3": "AGAAACAAAGGAAACTTCCCTGGTCGATTCTCAGGGCGCCAGTTCTCTAACTCTCGCTCTGAGATGAATGTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTT",
        "IR_VDJ_1_cdr3_aa": "ASSWMDRGEAF",
        "IR_VDJ_1_cdr3": "GCCAGCAGCTGGATGGATAGGGGTGAAGCTTTC",
        "IR_VDJ_1_fwr4_aa": "GQGTRLTVV",
        "IR_VDJ_1_fwr4": "GGACAAGGCACCAGACTCACAGTTGTAG",
        "IR_VJ_1_fwr1_aa": "AQTVTQSQPEMSVQEAETVTLSCTYD",
        "IR_VJ_1_fwr1": "GCTCAGACAGTCACTCAGTCTCAACCAGAGATGTCTGTGCAGGAGGCAGAGACCGTGACCCTGAGCTGCACATATGAC",
        "IR_VJ_1_cdr1_aa": "TSESDYY",
        "IR_VJ_1_cdr1": "ACCAGTGAGAGTGATTATTAT",
        "IR_VJ_1_fwr2_aa": "LFWYKQPPSRQMILVIR",
        "IR_VJ_1_fwr2": "TTATTCTGGTACAAGCAGCCTCCCAGCAGGCAGATGATTCTCGTTATTCGC",
        "IR_VJ_1_cdr2_aa": "QEAYKQQN",
        "IR_VJ_1_cdr2": "CAAGAAGCTTATAAGCAACAGAAT",
        "IR_VJ_1_fwr3_aa": "ATENRFSVNFQKAAKSFSLKISDSQLGDAAMYF",
        "IR_VJ_1_fwr3": "GCAACAGAGAATCGTTTCTCTGTGAACTTCCAGAAAGCAGCCAAATCCTTCAGTCTCAAGATCTCAGACTCACAGCTGGGGGATGCCGCGATGTATTTC",
        "IR_VJ_1_cdr3_aa": "ALYKVTGNQFY",
        "IR_VJ_1_cdr3": "GCTCTTTATAAGGTCACCGGTAACCAGTTCTAT",
        "IR_VJ_1_fwr4_aa": "GTGTSLTVIP",
        "IR_VJ_1_fwr4": "GGGACAGGGACAAGTTTGACGGTCATTCCAA",
    }
    for column, sequence in expected.items():
        assert first_row[column] == sequence, column
示例#5
0
def _read_anndata_from_10x_sample(path):
    """Load a full 10x sample for use in round-trip conversion tests.

    Prints the path being read, then returns the result of
    ``read_10x_vdj(path, include_fields=None)``.

    Test-dataset from https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    and https://support.10xgenomics.com/single-cell-vdj/datasets/4.0.0/sc5p_v2_hs_melanoma_10k
    under CC-BY-4.0.

    Caching: pytest only caches one fixture at a time, i.e. it doesn't work
    with parametrized fixtures, so this helper is meant to be memoized with
    ``lru_cache`` instead.
    NOTE(review): no caching decorator is visible on this definition here;
    confirm that ``lru_cache`` is actually applied.
    """
    print(f"Reading 10x file: {path}")
    return read_10x_vdj(path, include_fields=None)
示例#6
0
def test_read_10x():
    """Read the 10x ``all_contig_annotations.json`` fixture and spot-check ``obs``.

    Reads with ``include_fields=None``, asserts that one specific barcode is
    absent and that ``obs`` has three rows, then verifies selected ``IR_*``
    columns of each row.
    """
    adata = read_10x_vdj(
        TESTDATA / "10x/all_contig_annotations.json", include_fields=None
    )
    obs = adata.obs
    # this has `is_cell=false` and should be filtered out
    assert "AAACCTGAGACCTTTG-1" not in adata.obs_names
    assert len(obs) == 3
    first_row, second_row, third_row = (obs.iloc[pos] for pos in range(3))

    assert first_row.name == "AAACCTGAGACCTTTG-2"
    expected_first = {
        "IR_VDJ_1_junction_aa": "CASSPPSQGLSTGELFF",
        "IR_VDJ_1_junction": "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT",
        "IR_VDJ_1_np1_length": 4,
        "IR_VDJ_1_np2_length": 7,
        "IR_VDJ_1_duplicate_count": 1,
        "IR_VDJ_1_consensus_count": 494,
        "IR_VDJ_1_v_call": "TRBV18",
        "IR_VDJ_1_d_call": "TRBD1",
        "IR_VDJ_1_j_call": "TRBJ2-2",
        "IR_VDJ_1_c_call": "TRBC2",
    }
    for column, value in expected_first.items():
        assert first_row[column] == value, column
    assert _is_false(first_row["multi_chain"])
    empty_in_first = [
        "IR_VJ_1_junction_aa",
        "IR_VDJ_2_junction_aa",
        "IR_VJ_1_np1_length",
    ]
    assert np.all(_is_na(first_row[empty_in_first]))

    assert second_row.name == "AAACCTGAGTACGCCC-1"
    expected_second = {
        "IR_VJ_1_junction_aa": "CAMRVGGSQGNLIF",
        "IR_VJ_2_junction_aa": "CATDAKDSNYQLIW",
        "IR_VJ_1_duplicate_count": 9,
        "IR_VJ_2_duplicate_count": 4,
        "IR_VJ_1_np1_length": 4,
        "IR_VJ_2_np1_length": 4,
    }
    for column, value in expected_second.items():
        assert second_row[column] == value, column
    empty_in_second = ["IR_VDJ_1_junction_aa", "IR_VDJ_2_junction_aa"]
    assert np.all(_is_na(second_row[empty_in_second]))
    for column in ("IR_VJ_1_np2_length", "IR_VJ_2_np2_length"):
        assert _is_na(second_row[column]), column

    assert third_row.name == "CAGGTGCTCGTGGTCG-1"
    expected_third = {"IR_VJ_1_locus": "IGK", "IR_VDJ_1_locus": "IGH"}
    for column, value in expected_third.items():
        assert third_row[column] == value, column
    # non-productive: the second slot of each pair stays NA
    for column in ("IR_VJ_2_locus", "IR_VDJ_2_locus"):
        assert _is_na(third_row[column]), column
示例#7
0
def test_read_and_convert_10x_example(path):
    """Import a full 10x table and verify the IR-object round trip.

    First checks that reading `path` yields a non-empty result. Then checks
    that converting with `to_ir_objs` and back with `from_ir_objs` leaves
    `obs` unchanged. Both checks live in one test to avoid loading the data
    twice, since this is already one of the longer-running tests.

    Test-dataset from https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    and https://support.10xgenomics.com/single-cell-vdj/datasets/4.0.0/sc5p_v2_hs_melanoma_10k
    under CC-BY-4.0
    """
    original = read_10x_vdj(path)
    n_rows = original.shape[0]
    assert n_rows > 0

    # Round trip: AnnData -> IR objects -> AnnData must preserve `obs`.
    roundtripped = from_ir_objs(to_ir_objs(original))
    pdt.assert_frame_equal(original.obs, roundtripped.obs)
示例#8
0
def test_read_10x_csv():
    """Read the 10x ``filtered_contig_annotations.csv`` fixture and spot-check ``obs``.

    Asserts that ``obs`` has four rows, then verifies selected
    ``TRA_*``/``TRB_*`` columns of the rows at positions 1 and 3.
    """
    adata = read_10x_vdj("tests/data/10x/filtered_contig_annotations.csv")
    obs = adata.obs
    assert len(obs) == 4
    second_row, fourth_row = (obs.iloc[pos] for pos in (1, 3))

    assert second_row.name == "AAACCTGAGTACGCCC-1"
    expected_second = {
        "TRB_1_cdr3": "CASSLGPSTDTQYF",
        "TRB_1_cdr3_nt": "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT",
        "TRB_1_expr": 55,
        "TRB_1_v_gene": "TRBV7-2",
        "TRB_1_d_gene": "TRBD2",
        "TRB_1_j_gene": "TRBJ2-3",
        "TRB_1_c_gene": "TRBC2",
    }
    for column, value in expected_second.items():
        assert second_row[column] == value, column
    assert _is_na(second_row["TRB_1_junction_ins"])
    assert _is_false(second_row["multi_chain"])

    assert fourth_row.name == "AAACCTGGTCCGTTAA-1"
    expected_fourth = {
        "TRA_1_cdr3": "CALNTGGFKTIF",
        "TRA_2_cdr3": "CAVILDARLMF",
        "TRA_1_expr": 5,
        "TRA_2_expr": 5,
    }
    for column, value in expected_fourth.items():
        assert fourth_row[column] == value, column
    assert _is_na(fourth_row["TRB_2_cdr3"])