def test_read_10x_csv():
    """Spot-check three rows of `.obs` after reading the filtered contig CSV."""
    obs = read_10x_vdj(TESTDATA / "10x/filtered_contig_annotations.csv").obs
    assert obs.shape[0] == 5

    # Rows are addressed by position in `.obs`.
    first = obs.iloc[1]
    second = obs.iloc[3]
    third = obs.iloc[4]

    assert first.name == "AAACCTGAGTACGCCC-1"
    first_expected = {
        "IR_VDJ_1_junction_aa": "CASSLGPSTDTQYF",
        "IR_VDJ_1_junction": "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT",
        "IR_VDJ_1_duplicate_count": 55,
        "IR_VDJ_1_consensus_count": 18021,
        "IR_VDJ_1_v_call": "TRBV7-2",
        "IR_VDJ_1_d_call": "TRBD2",
        "IR_VDJ_1_j_call": "TRBJ2-3",
        "IR_VDJ_1_c_call": "TRBC2",
        "IR_VJ_1_locus": "TRA",
        "IR_VDJ_1_locus": "TRB",
    }
    for column, value in first_expected.items():
        assert first[column] == value
    assert _is_false(first["multi_chain"])

    assert second.name == "AAACCTGGTCCGTTAA-1"
    second_expected = {
        "IR_VJ_1_junction_aa": "CALNTGGFKTIF",
        "IR_VJ_2_junction_aa": "CAVILDARLMF",
        "IR_VJ_1_duplicate_count": 5,
        "IR_VJ_2_duplicate_count": 5,
        "IR_VJ_1_locus": "TRA",
        "IR_VDJ_1_locus": "TRB",
        "IR_VJ_2_locus": "TRA",
    }
    for column, value in second_expected.items():
        assert second[column] == value
    assert _is_na(second["IR_VDJ_2_junction_aa"])

    assert third.name == "AAACTTGGTCCGTTAA-1"
    third_expected = {
        "IR_VJ_1_locus": "IGK",
        "IR_VDJ_1_locus": "IGH",
    }
    for column, value in third_expected.items():
        assert third[column] == value
def test_read_10x():
    """Spot-check both rows of `.obs` after reading the all-contig JSON."""
    adata = read_10x_vdj("tests/data/10x/all_contig_annotations.json")
    obs = adata.obs

    # this has `is_cell=false` and should be filtered out
    assert "AAACCTGAGACCTTTG-1" not in adata.obs_names
    assert obs.shape[0] == 2

    first = obs.iloc[0]
    second = obs.iloc[1]

    assert first.name == "AAACCTGAGACCTTTG-2"
    first_expected = {
        "TRB_1_cdr3": "CASSPPSQGLSTGELFF",
        "TRB_1_cdr3_nt": "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT",
        "TRB_1_junction_ins": 4 + 7,
        "TRB_1_expr": 1,
        "TRB_1_v_gene": "TRBV18",
        "TRB_1_d_gene": "TRBD1",
        "TRB_1_j_gene": "TRBJ2-2",
        "TRB_1_c_gene": "TRBC2",
    }
    for column, value in first_expected.items():
        assert first[column] == value
    assert _is_false(first["multi_chain"])
    first_missing = ["TRA_1_cdr3", "TRB_2_cdr3", "TRA_1_junction_ins"]
    assert np.all(_is_na(first[first_missing]))

    assert second.name == "AAACCTGAGTACGCCC-1"
    second_expected = {
        "TRA_1_cdr3": "CAMRVGGSQGNLIF",
        "TRA_2_cdr3": "CATDAKDSNYQLIW",
        "TRA_1_expr": 9,
        "TRA_2_expr": 4,
        "TRA_1_junction_ins": 4,
        "TRA_2_junction_ins": 4,
    }
    for column, value in second_expected.items():
        assert second[column] == value
    second_missing = ["TRB_1_cdr3", "TRB_2_cdr3"]
    assert np.all(_is_na(second[second_missing]))
def test_read_10x_example():
    """Smoke-test that a full 10x CSV table can be imported without errors.

    Test-dataset from
    https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    under CC-BY-4.0
    """
    anndata = read_10x_vdj(
        "tests/data/10x/vdj_nextgem_hs_pbmc3_t_filtered_contig_annotations.csv.gz"
    )
    # Without this check the result was bound but never inspected, so a reader
    # that silently returned zero observations would still pass the test.
    assert anndata.shape[0] > 0
def test_read_10x_json_cr6():
    """Check the extra CR6 columns: fwr{1,2,3,4}{,_aa} and cdr{1,2,3}{,_aa}."""
    obs = read_10x_vdj(
        TESTDATA
        / "10x/10k_BMMNC_5pv2_nextgem_Multiplex_vdj_t_all_contig_annotations_small.json",
        include_fields=None,
    ).obs
    assert obs.shape[0] == 2

    first = obs.iloc[0]
    assert first.name == "AAACCTGCACAGGTTT-1"

    # Expected value per column of the first row, in the order the regions
    # are named (fwr1, cdr1, fwr2, cdr2, fwr3, cdr3, fwr4), VDJ chain first.
    expected = {
        "IR_VDJ_1_fwr1_aa": "KAGVTQTPRYLIKTRGQQVTLSCSPI",
        "IR_VDJ_1_fwr1": "AAGGCTGGAGTCACTCAAACTCCAAGATATCTGATCAAAACGAGAGGACAGCAAGTGACACTGAGCTGCTCCCCTATC",
        "IR_VDJ_1_cdr1_aa": "SGHRS",
        "IR_VDJ_1_cdr1": "TCTGGGCATAGGAGT",
        "IR_VDJ_1_fwr2_aa": "VSWYQQTPGQGLQFLFE",
        "IR_VDJ_1_fwr2": "GTATCCTGGTACCAACAGACCCCAGGACAGGGCCTTCAGTTCCTCTTTGAA",
        "IR_VDJ_1_cdr2_aa": "YFSETQ",
        "IR_VDJ_1_cdr2": "TACTTCAGTGAGACACAG",
        "IR_VDJ_1_fwr3_aa": "RNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYL",
        "IR_VDJ_1_fwr3": "AGAAACAAAGGAAACTTCCCTGGTCGATTCTCAGGGCGCCAGTTCTCTAACTCTCGCTCTGAGATGAATGTGAGCACCTTGGAGCTGGGGGACTCGGCCCTTTATCTT",
        "IR_VDJ_1_cdr3_aa": "ASSWMDRGEAF",
        "IR_VDJ_1_cdr3": "GCCAGCAGCTGGATGGATAGGGGTGAAGCTTTC",
        "IR_VDJ_1_fwr4_aa": "GQGTRLTVV",
        "IR_VDJ_1_fwr4": "GGACAAGGCACCAGACTCACAGTTGTAG",
        "IR_VJ_1_fwr1_aa": "AQTVTQSQPEMSVQEAETVTLSCTYD",
        "IR_VJ_1_fwr1": "GCTCAGACAGTCACTCAGTCTCAACCAGAGATGTCTGTGCAGGAGGCAGAGACCGTGACCCTGAGCTGCACATATGAC",
        "IR_VJ_1_cdr1_aa": "TSESDYY",
        "IR_VJ_1_cdr1": "ACCAGTGAGAGTGATTATTAT",
        "IR_VJ_1_fwr2_aa": "LFWYKQPPSRQMILVIR",
        "IR_VJ_1_fwr2": "TTATTCTGGTACAAGCAGCCTCCCAGCAGGCAGATGATTCTCGTTATTCGC",
        "IR_VJ_1_cdr2_aa": "QEAYKQQN",
        "IR_VJ_1_cdr2": "CAAGAAGCTTATAAGCAACAGAAT",
        "IR_VJ_1_fwr3_aa": "ATENRFSVNFQKAAKSFSLKISDSQLGDAAMYF",
        "IR_VJ_1_fwr3": "GCAACAGAGAATCGTTTCTCTGTGAACTTCCAGAAAGCAGCCAAATCCTTCAGTCTCAAGATCTCAGACTCACAGCTGGGGGATGCCGCGATGTATTTC",
        "IR_VJ_1_cdr3_aa": "ALYKVTGNQFY",
        "IR_VJ_1_cdr3": "GCTCTTTATAAGGTCACCGGTAACCAGTTCTAT",
        "IR_VJ_1_fwr4_aa": "GTGTSLTVIP",
        "IR_VJ_1_fwr4": "GGGACAGGGACAAGTTTGACGGTCATTCCAA",
    }
    for column, value in expected.items():
        assert first[column] == value
def _read_anndata_from_10x_sample(path):
    """Read the 10x sample at `path` with `include_fields=None` and return it.

    The result is meant to feed the round-trip conversion tests.

    Test-datasets from
    https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    and
    https://support.10xgenomics.com/single-cell-vdj/datasets/4.0.0/sc5p_v2_hs_melanoma_10k
    under CC-BY-4.0.

    The original note on this helper says pytest only caches one fixture at a
    time (so parametrized fixtures are not cached) and that `lru_cache` is
    used instead.
    NOTE(review): no caching decorator is visible on this definition --
    confirm that it is actually applied.
    """
    print(f"Reading 10x file: {path}")
    return read_10x_vdj(path, include_fields=None)
def test_read_10x():
    """Spot-check all three rows of `.obs` after reading the all-contig JSON."""
    adata = read_10x_vdj(
        TESTDATA / "10x/all_contig_annotations.json", include_fields=None
    )
    obs = adata.obs

    # this has `is_cell=false` and should be filtered out
    assert "AAACCTGAGACCTTTG-1" not in adata.obs_names
    assert obs.shape[0] == 3

    first = obs.iloc[0]
    second = obs.iloc[1]
    third = obs.iloc[2]

    assert first.name == "AAACCTGAGACCTTTG-2"
    first_expected = {
        "IR_VDJ_1_junction_aa": "CASSPPSQGLSTGELFF",
        "IR_VDJ_1_junction": "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT",
        "IR_VDJ_1_np1_length": 4,
        "IR_VDJ_1_np2_length": 7,
        "IR_VDJ_1_duplicate_count": 1,
        "IR_VDJ_1_consensus_count": 494,
        "IR_VDJ_1_v_call": "TRBV18",
        "IR_VDJ_1_d_call": "TRBD1",
        "IR_VDJ_1_j_call": "TRBJ2-2",
        "IR_VDJ_1_c_call": "TRBC2",
    }
    for column, value in first_expected.items():
        assert first[column] == value
    assert _is_false(first["multi_chain"])
    first_missing = [
        "IR_VJ_1_junction_aa",
        "IR_VDJ_2_junction_aa",
        "IR_VJ_1_np1_length",
    ]
    assert np.all(_is_na(first[first_missing]))

    assert second.name == "AAACCTGAGTACGCCC-1"
    second_expected = {
        "IR_VJ_1_junction_aa": "CAMRVGGSQGNLIF",
        "IR_VJ_2_junction_aa": "CATDAKDSNYQLIW",
        "IR_VJ_1_duplicate_count": 9,
        "IR_VJ_2_duplicate_count": 4,
        "IR_VJ_1_np1_length": 4,
        "IR_VJ_2_np1_length": 4,
    }
    for column, value in second_expected.items():
        assert second[column] == value
    second_missing = ["IR_VDJ_1_junction_aa", "IR_VDJ_2_junction_aa"]
    assert np.all(_is_na(second[second_missing]))
    for column in ("IR_VJ_1_np2_length", "IR_VJ_2_np2_length"):
        assert _is_na(second[column])

    assert third.name == "CAGGTGCTCGTGGTCG-1"
    assert third["IR_VJ_1_locus"] == "IGK"
    assert third["IR_VDJ_1_locus"] == "IGH"
    # The secondary chains of this cell are non-productive.
    for column in ("IR_VJ_2_locus", "IR_VDJ_2_locus"):
        assert _is_na(third[column])
def test_read_and_convert_10x_example(path):
    """Import a full 10x table and verify the IR-object round trip.

    Reads `path`, checks that at least one observation was loaded, then
    converts with `to_ir_objs` followed by `from_ir_objs` and requires the
    resulting `.obs` to equal the original. Both checks live in one test so
    the data is not loaded twice; this is already one of the longer-running
    tests.

    Test-datasets from
    https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
    and
    https://support.10xgenomics.com/single-cell-vdj/datasets/4.0.0/sc5p_v2_hs_melanoma_10k
    under CC-BY-4.0
    """
    adata = read_10x_vdj(path)
    assert adata.shape[0] > 0

    # Round trip: AnnData -> IR objects -> AnnData must be the identity on `.obs`.
    roundtripped = from_ir_objs(to_ir_objs(adata))
    pdt.assert_frame_equal(adata.obs, roundtripped.obs)
def test_read_10x_csv():
    """Spot-check two rows of `.obs` after reading the filtered contig CSV."""
    obs = read_10x_vdj("tests/data/10x/filtered_contig_annotations.csv").obs
    assert obs.shape[0] == 4

    # Rows are addressed by position in `.obs`.
    first = obs.iloc[1]
    second = obs.iloc[3]

    assert first.name == "AAACCTGAGTACGCCC-1"
    first_expected = {
        "TRB_1_cdr3": "CASSLGPSTDTQYF",
        "TRB_1_cdr3_nt": "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT",
        "TRB_1_expr": 55,
        "TRB_1_v_gene": "TRBV7-2",
        "TRB_1_d_gene": "TRBD2",
        "TRB_1_j_gene": "TRBJ2-3",
        "TRB_1_c_gene": "TRBC2",
    }
    for column, value in first_expected.items():
        assert first[column] == value
    assert _is_na(first["TRB_1_junction_ins"])
    assert _is_false(first["multi_chain"])

    assert second.name == "AAACCTGGTCCGTTAA-1"
    second_expected = {
        "TRA_1_cdr3": "CALNTGGFKTIF",
        "TRA_2_cdr3": "CAVILDARLMF",
        "TRA_1_expr": 5,
        "TRA_2_expr": 5,
    }
    for column, value in second_expected.items():
        assert second[column] == value
    assert _is_na(second["TRB_2_cdr3"])