def test_update_plus(airr_reannotated):
    """Exercise ``Dandelion.update_plus`` with several ``option`` values.

    After each call (except the very last one) the test checks that the
    expected column name is present in ``vdj.metadata``. The final
    ``'cdr3 lengths'`` call on the second object is only required not to
    raise.
    """
    # First object: default call, then two explicit options.
    first_pass = (
        ({}, 'mu_count'),
        ({'option': 'sequence'}, 'sequence_VDJ'),
        ({'option': 'cdr3 lengths'}, 'junction_aa_length_VDJ'),
    )
    vdj = ddl.Dandelion(airr_reannotated)
    for call_kwargs, expected_column in first_pass:
        vdj.update_plus(**call_kwargs)
        assert expected_column in vdj.metadata

    # Fresh object: 'mutations' and 'all', each followed by a metadata check.
    second_pass = (
        ('mutations', 'mu_count'),
        ('all', 'sequence_VDJ'),
    )
    vdj = ddl.Dandelion(airr_reannotated)
    for option_name, expected_column in second_pass:
        vdj.update_plus(option=option_name)
        assert expected_column in vdj.metadata

    vdj.update_plus(option='cdr3 lengths')
def test_travdv_filter(create_testfolder, dummy_adata_travdv):
    """Load the filtered contig table and pass it through ``filter_contigs``.

    Checks that the loaded table has 23 rows with no ``'TRD'`` locus
    entries, and that the filtered result has 8 rows.
    """
    contig_path = (str(create_testfolder) +
                   '/dandelion/filtered_contig_dandelion.tsv')
    vdj = ddl.Dandelion(contig_path)

    n_rows = vdj.data.shape[0]
    assert n_rows == 23

    trd_entries = [locus for locus in vdj.data['locus'] if locus == 'TRD']
    assert len(trd_entries) == 0

    # Only the filtered Dandelion object is inspected; the second returned
    # value is unused.
    filtered_vdj, _filtered_adata = ddl.pp.filter_contigs(
        vdj, dummy_adata_travdv)
    assert filtered_vdj.data.shape[0] == 8
def to_dandelion(adata: AnnData):
    """Export data to `Dandelion <https://github.com/zktuong/dandelion>`_ (:cite:`Stephenson2021`).

    Parameters
    ----------
    adata
        annotated data matrix with :term:`IR` annotations.

    Returns
    -------
    `Dandelion` object.

    Raises
    ------
    ImportError
        If the optional `dandelion` package cannot be imported. The original
        import error is attached as the cause.
    """
    try:
        import dandelion as ddl
    except ImportError as e:
        # Only a failed import is translated into the install hint; any other
        # error raised while importing dandelion propagates unchanged.
        raise ImportError(
            "Please install dandelion: pip install sc-dandelion."
        ) from e

    airr_cells = to_airr_cells(adata)

    # One entry per chain, keyed by the sequence id assigned below.
    contig_dicts = {}
    for tmp_cell in airr_cells:
        for i, chain in enumerate(tmp_cell.to_airr_records(), start=1):
            # dandelion-specific modifications
            chain.update(
                {
                    "sequence_id": f"{tmp_cell.cell_id}_contig_{i}",
                }
            )
            contig_dicts[chain["sequence_id"]] = chain

    data = pd.DataFrame.from_dict(contig_dicts, orient="index")
    return ddl.Dandelion(ddl.load_data(data))
def test_mutation(create_testfolder, airr_reannotated):
    """Run ``quantify_mutations`` on a TSV file, with and without frequency.

    The fixture table is written to ``test.tsv`` in the test folder,
    ``quantify_mutations`` is run on that path, and the file is loaded into
    a ``Dandelion`` object to check ``mu_count``. The same is then repeated
    with ``frequency=True`` to check ``mu_freq``.
    """
    f = create_testfolder / "test.tsv"
    airr_reannotated.to_csv(f, sep='\t', index=False)

    ddl.pp.quantify_mutations(f)
    out = pd.read_csv(f, sep='\t')
    vdj = ddl.Dandelion(out)
    assert not vdj.data.mu_count.empty

    ddl.pp.quantify_mutations(f, frequency=True)
    # Reload before asserting: the object built above was read before the
    # frequency pass, so checking it would not exercise that call at all.
    # NOTE(review): presumes quantify_mutations writes its result back to
    # the given path, as the read-after-call above already relies on.
    out = pd.read_csv(f, sep='\t')
    vdj = ddl.Dandelion(out)
    assert not vdj.data.mu_freq.empty
def test_manual_threshold_and_define_clones(create_testfolder):
    """Set a threshold by hand and run ``define_clones`` twice.

    The first call uses the default output column and is checked through
    ``clone_id``; the second passes ``key_added='changeo_clone'`` and is
    checked through ``changeo_clone``.
    """
    table = pd.read_csv(create_testfolder / "test.tsv", sep='\t')
    vdj = ddl.Dandelion(table)
    vdj.threshold = 0.1

    ddl.tl.define_clones(vdj)
    assert not vdj.data.clone_id.empty

    ddl.tl.define_clones(vdj, key_added='changeo_clone')
    assert not vdj.data.changeo_clone.empty
def test_quantify_mut_2(create_testfolder, processed_files, freq, colname):
    """Run ``quantify_mutations`` on a ``Dandelion`` object and check a column.

    ``freq`` is forwarded as ``frequency`` and ``colname`` names the column
    to inspect (presumably both are supplied by a parametrize decorator that
    is not part of this block). The column must be non-empty and have dtype
    ``float`` for ``'mu_freq'`` and ``int`` otherwise.
    """
    contig_file = create_testfolder / str(
        'dandelion/' + processed_files['filtered'])
    vdj = ddl.Dandelion(contig_file)
    ddl.pp.quantify_mutations(vdj, frequency=freq)

    assert not vdj.data[colname].empty

    expected_dtype = float if colname == 'mu_freq' else int
    assert vdj.data[colname].dtype == expected_dtype
def test_setup(create_testfolder, airr_reannotated, dummy_adata):
    """Filter contigs, build a network, transfer to AnnData and save both.

    Checks the row counts of the input table, the filtered data, the
    metadata and the AnnData object, then assigns every contig to clone
    ``'A'``, rebuilds the ``Dandelion`` object, generates the network and
    transfers it to ``adata``. The results are written to ``test.h5`` and
    ``test.h5ad`` in the test folder.
    """
    vdj, adata = ddl.pp.filter_contigs(airr_reannotated, dummy_adata)
    assert airr_reannotated.shape[0] == 9
    assert vdj.data.shape[0] == 7
    assert vdj.metadata.shape[0] == 4
    assert adata.n_obs == 5

    # Seven rows remain after filtering; put them all in a single clone.
    vdj.data['clone_id'] = ['A'] * 7
    vdj = ddl.Dandelion(vdj.data)
    ddl.tl.generate_network(vdj)
    ddl.tl.transfer(adata, vdj)
    assert 'clone_id' in adata.obs
    assert 'X_vdj' in adata.obsm

    vdj_out = create_testfolder / "test.h5"
    adata_out = create_testfolder / "test.h5ad"
    vdj.write_h5(vdj_out)
    adata.write_h5ad(adata_out)
def test_container():
    """Run the preprocessing script in the container and load its output.

    Executes ``/share/dandelion_preprocess.py`` from ``/tests``, reads the
    resulting filtered contig table, checks that the expected columns are
    non-empty and that a ``Dandelion`` object can be built from the table.
    """
    # NOTE(review): the exit status of the shell command is not checked, so
    # a failed run is only noticed if the output file is missing or stale.
    os.system(
        "cd /tests; python /share/dandelion_preprocess.py --meta test.csv;")
    dat = pd.read_csv(
        '/tests/sample_test_10x/dandelion/filtered_contig_dandelion.tsv',
        sep='\t')
    assert not dat['c_call'].empty
    assert not dat['v_call_genotyped'].empty
    assert not dat['mu_count'].empty
    assert not dat['mu_freq'].empty

    # Construct directly instead of swallowing every exception: if loading
    # fails, the test still fails, but with the real traceback rather than
    # an uninformative "vdj is None" assertion.
    vdj = ddl.Dandelion(dat)
    assert vdj is not None
def test_generate_network(create_testfolder, resample, expected):
    """Generate a network from ``test.h5``, optionally with downsampling.

    When ``resample`` is not ``None`` it is passed as ``downsample`` and the
    returned object replaces ``vdj``; otherwise the network is generated on
    the loaded object. The test then checks distance, edges, ``n_obs``,
    layout and graph, puts every contig in clone ``'1'``, rebuilds the
    object and checks that edges exist after regenerating the network.
    """
    vdj = ddl.read_h5(create_testfolder / "test.h5")

    if resample is None:
        ddl.tl.generate_network(vdj)
    else:
        vdj = ddl.tl.generate_network(vdj, downsample=resample)

    assert vdj.distance is not None
    assert vdj.edges is None
    assert vdj.n_obs == expected
    assert vdj.layout is not None
    assert vdj.graph is not None

    # Collapse everything into a single clone and rebuild from the table.
    vdj.data['clone_id'] = '1'
    vdj = ddl.Dandelion(vdj.data)
    assert vdj.data.clone_id.dtype == 'object'

    ddl.tl.generate_network(vdj)
    assert vdj.edges is not None
def test_update_germlines(create_testfolder, processed_files, database_paths,
                          fasta_10x):
    """Check ``update_germline`` with and without a ``corrected`` argument.

    The germline attribute must be non-empty after a plain update, after a
    write/read round trip through ``test_airr_reannotated.h5``, and after
    updates with ``corrected`` given as a fasta path and as the
    ``fasta_10x`` fixture. Passing a list as ``corrected`` must raise
    ``TypeError``.
    """
    folder = str(create_testfolder)
    contig_file = create_testfolder / str(
        'dandelion/' + processed_files['filtered'])
    vdj = ddl.Dandelion(contig_file)

    vdj.update_germline(germline=database_paths['germline'])
    assert len(vdj.germline) > 0

    # Round trip through h5 and make sure the germline survives.
    out_file = folder + "/test_airr_reannotated.h5"
    vdj.write_h5(out_file)
    reloaded = ddl.read_h5(out_file)
    assert len(reloaded.germline) > 0

    vdj.update_germline(germline=database_paths['germline'],
                        corrected=folder + "/filtered_contig.fasta")
    assert len(vdj.germline) > 0

    vdj.update_germline(germline=database_paths['germline'],
                        corrected=fasta_10x)
    assert len(vdj.germline) > 0

    with pytest.raises(TypeError):
        vdj.update_germline(germline=database_paths['germline'],
                            corrected=[])
def test_setup():
    """Download 10x example data and write the test input files.

    Fetches an AIRR rearrangement table, adds ``locus``, ``umi_count`` and
    ``sample_id`` columns, wraps it in a ``Dandelion`` object and round-trips
    it through ``tests/test.h5`` and ``tests/test.pkl.pbz2``. Then fetches
    the matching feature-barcode matrix, stores it as ``tests/sctest.h5``,
    reads it with scanpy and writes ``tests/sctest.h5ad``.
    """
    file = "https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_B_1k_multi_5gex_b/sc5p_v2_hs_B_1k_multi_5gex_b_vdj_b_airr_rearrangement.tsv"
    r = requests.get(file)
    # Fail here on an HTTP error instead of parsing an error page as TSV.
    r.raise_for_status()
    test_data = pd.read_csv(StringIO(r.text), sep="\t")

    # Derive the locus from the V gene call; None when no IG prefix matches.
    test_data["locus"] = [
        "IGH" if "IGH" in i
        else "IGK" if "IGK" in i
        else "IGL" if "IGL" in i
        else None
        for i in test_data.v_call
    ]
    test_data["umi_count"] = test_data["duplicate_count"]
    test_data["sample_id"] = "test"

    test_ddl = ddl.Dandelion(test_data)
    test_ddl.write_h5("tests/test.h5", compression="bzip2")
    test_ddl.write_pkl("tests/test.pkl.pbz2")
    test = ddl.read_h5("tests/test.h5")
    _ = ddl.read_pkl("tests/test.pkl.pbz2")

    scfile = "https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_B_1k_multi_5gex_b/sc5p_v2_hs_B_1k_multi_5gex_b_count_filtered_feature_bc_matrix.h5"
    r = requests.get(scfile)
    r.raise_for_status()
    # Close the handle before the file is read back below, so the content is
    # guaranteed to be flushed to disk.
    with open("tests/sctest.h5", "wb") as handle:
        handle.write(r.content)

    adata = sc.read_10x_h5("tests/sctest.h5")
    adata.write("tests/sctest.h5ad", compression="gzip")
    print(test)
    print(adata)
def test_readwrite_h5(create_testfolder):
    """Round-trip a ``Dandelion`` object through h5 with several settings.

    The object is written with default settings, with ``complib`` and with
    ``compression``; after each write it is read back and the
    ``np1_length``, ``np2_length`` and ``junction_length`` columns must be
    non-empty. Passing both ``complib`` and ``compression`` must raise
    ``ValueError``.
    """
    def _assert_lengths_present(obj):
        # Same three checks, in the same order, for every loaded object.
        assert not obj.data.np1_length.empty
        assert not obj.data.np2_length.empty
        assert not obj.data.junction_length.empty

    folder = str(create_testfolder)
    tsv_path = folder + "/test_airr_reannotated.tsv"
    h5_path = folder + "/test_airr_reannotated.h5"

    vdj = ddl.Dandelion(tsv_path)
    _assert_lengths_present(vdj)

    write_variants = (
        {},
        {'complib': 'blosc:lz4'},
        {'compression': 'blosc:lz4'},
    )
    for write_kwargs in write_variants:
        vdj.write_h5(h5_path, **write_kwargs)
        reloaded = ddl.read_h5(h5_path)
        _assert_lengths_present(reloaded)

    with pytest.raises(ValueError):
        vdj.write_h5(h5_path,
                     complib='blosc:lz4',
                     compression='blosc:lz4')
def test_readwrite_pkl(create_testfolder):
    """Round-trip a ``Dandelion`` object through three pickle file names.

    The object is written to ``.pkl``, ``.pkl.gz`` and ``.pkl.pbz2`` paths;
    after each write it is read back and the ``np1_length``,
    ``np2_length`` and ``junction_length`` columns must be non-empty.
    """
    def _assert_lengths_present(obj):
        # Same three checks, in the same order, for every loaded object.
        assert not obj.data.np1_length.empty
        assert not obj.data.np2_length.empty
        assert not obj.data.junction_length.empty

    folder = str(create_testfolder)
    tsv_path = folder + "/test_airr_reannotated.tsv"
    pickle_paths = (
        folder + "/test_airr_reannotated.pkl",
        folder + "/test_airr_reannotated.pkl.gz",
        folder + "/test_airr_reannotated.pkl.pbz2",
    )

    vdj = ddl.Dandelion(tsv_path)
    _assert_lengths_present(vdj)

    for pickle_path in pickle_paths:
        vdj.write_pkl(pickle_path)
        reloaded = ddl.read_pkl(pickle_path)
        _assert_lengths_present(reloaded)
def test_update_germlines(create_testfolder, processed_files, database_paths):
    """Call ``update_germlines`` and check the germline attribute is filled.

    The filtered contig file is loaded, ``update_germlines`` is called with
    the germline database path as its positional argument, and
    ``vdj.germline`` must be non-empty afterwards.
    """
    relative_name = str('dandelion/' + processed_files['filtered'])
    vdj = ddl.Dandelion(create_testfolder / relative_name)

    germline_db = database_paths['germline']
    vdj.update_germlines(germline_db)

    assert len(vdj.germline) > 0
def test_loadtravdv2(airr_travdv):
    """Load the ``airr_travdv`` fixture and check its size and loci.

    The loaded table must have 6 rows and every ``locus`` value must be
    ``'TRD'``.
    """
    vdj = ddl.Dandelion(airr_travdv)

    n_rows = vdj.data.shape[0]
    assert n_rows == 6

    assert all(locus == 'TRD' for locus in vdj.data['locus'])
def test_create_germlines(create_testfolder, database_paths):
    """Run ``create_germlines`` on the table stored in ``test.tsv``.

    After the call the ``germline_alignment_d_mask`` column of the
    ``Dandelion`` data must be non-empty.
    """
    table = pd.read_csv(create_testfolder / "test.tsv", sep='\t')
    vdj = ddl.Dandelion(table)

    ddl.pp.create_germlines(vdj, germline=database_paths['germline'])

    assert not vdj.data.germline_alignment_d_mask.empty
def test_loadtravdv_reannotated(create_testfolder):
    """Load the filtered contig table and check its size and loci.

    The table must have 23 rows and none of its ``locus`` values may be
    ``'TRD'``.
    """
    contig_path = (str(create_testfolder) +
                   '/dandelion/filtered_contig_dandelion.tsv')
    vdj = ddl.Dandelion(contig_path)

    assert vdj.data.shape[0] == 23
    assert not any(locus == 'TRD' for locus in vdj.data['locus'])
def test_update_germlines_fail(create_testfolder, processed_files):
    """Check that ``update_germline`` without arguments raises ``KeyError``.

    The filtered contig file is loaded and ``update_germline`` is called
    with no germline given.
    """
    relative_name = str('dandelion/' + processed_files['filtered'])
    vdj = ddl.Dandelion(create_testfolder / relative_name)

    with pytest.raises(KeyError):
        vdj.update_germline()