def test_generic_pandas_mapper():
    """
    Check that generic_pandas_mapper keeps only the mapped column and
    renames it according to the mapping.
    """
    source = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    column_map = OrderedDict([("a", "ardvark")])

    # Only column "a" is mapped, so "b" must be absent from the result.
    wanted = pd.DataFrame({"ardvark": [1, 2]})
    result = mappers.generic_pandas_mapper(source, column_map)

    assert result.equals(wanted)
def test_generic_pandas_mapper_raises_assertion_error():
    """
    Test that a KeyError is raised if the mapper contains keys not in
    df.columns.

    NOTE: despite the function name, the exception checked here is
    KeyError, not AssertionError. The name is left unchanged so existing
    test selections keep working.
    """
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # "c" is not a column of df, so the mapping cannot be applied.
    mapper = OrderedDict([('c', 'canary')])
    with pytest.raises(KeyError):
        mappers.generic_pandas_mapper(df, mapper)
def test_data_type_conversion_with_reduce_file_size():
    """
    Test that elements of the pairwise CDR3 matrices are np.int16 after
    calling TCRrep.reduce_file_size().

    The TCRrep instance is built by generate_tr(), which performs the same
    load / map / infer / deduplicate / legacy-distance steps this test
    previously repeated inline.
    """
    import numpy as np

    tr = generate_tr()

    # Before reduction the elements are expected to be plain integers.
    # `np.int` was only an alias of the builtin `int`; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is used
    # directly to keep the exact same check on every NumPy version.
    assert isinstance(tr.cdr3_a_aa_pw[1, 1], int)
    assert isinstance(tr.cdr3_b_aa_pw[1, 1], int)

    tr.reduce_file_size()

    # After reduction the same elements must be stored as 16-bit integers.
    assert isinstance(tr.cdr3_a_aa_pw[1, 1], np.int16)
    assert isinstance(tr.cdr3_b_aa_pw[1, 1], np.int16)
def generate_tr():
    """
    Build a TCRrep instance from the mouse paired-chain clones test file.

    Steps, in order:
      1. read the tab-separated clones file,
      2. keep only rows whose epitope is "PA" or "F2",
      3. rename columns with mappers.tcrdist_clone_df_to_tcrdist2_mapping,
      4. construct a mouse TCRrep and infer CDRs for both chains,
      5. set index_cols, deduplicate, and run the legacy alpha/beta
         distance method.

    Returns
    -------
    tr : TCRrep
        The instance after _tcrdist_legacy_method_alpha_beta() has run.
    """
    import pandas as pd
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    clones_path = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    clones = pd.read_csv(clones_path, sep="\t")

    # Restrict the input to the two epitopes of interest.
    is_pa = clones.epitope == "PA"
    is_f2 = clones.epitope == "F2"
    clones = clones[is_pa | is_f2].copy()

    # Translate the clones-file column names to the tcrdist2 names.
    cell_df = mappers.generic_pandas_mapper(
        df=clones,
        mapping=mappers.tcrdist_clone_df_to_tcrdist2_mapping,
    )

    tr = TCRrep(cell_df=cell_df, organism="mouse")

    for chain in ('alpha', 'beta'):
        tr.infer_cdrs_from_v_gene(chain=chain, imgt_aligned=True)

    identifier_cols = ['clone_id', 'subject', 'epitope']
    gene_cols = ['v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene']
    cdr_cols = [
        'cdr3_a_aa', 'cdr3_b_aa',
        'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
        'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
        'cdr3_b_nucseq', 'cdr3_a_nucseq',
    ]
    countrep_cols = ['va_countreps', 'ja_countreps', 'vb_countreps', 'jb_countreps']
    legacy_gene_cols = ['va_gene', 'vb_gene', 'ja_gene', 'jb_gene']
    # Concatenation order matches the original flat list exactly.
    tr.index_cols = (
        identifier_cols + gene_cols + cdr_cols + countrep_cols + legacy_gene_cols
    )

    tr.deduplicate()
    tr._tcrdist_legacy_method_alpha_beta()
    return tr
def test_CompleteExample_with_TCRMotif_Invoked_From_within_TCRsubset():
    """
    End-to-end example: build a TCRrep for the PA and F2 epitopes, take the
    PA subset as a TCRsubset, load motifs from a log file, and evaluate and
    plot the first three motifs.

    The only explicit assertion is that dist_a + dist_b equals
    paired_tcrdist; the motif evaluation and plotting steps are exercised
    for errors only.
    """
    import pandas as pd
    import numpy as np
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep
    # TCRMotif, StoreIOMotif and StoreIOEntropy are not referenced below;
    # the imports are kept so the test still exercises those modules.
    from tcrdist.cdr3_motif import TCRMotif
    from tcrdist.subset import TCRsubset
    from tcrdist.storage import StoreIOMotif, StoreIOEntropy
    from tcrdist.plotting import plot_pwm

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    # Keep only the PA and F2 epitopes.
    ind = (tcrdist_clone_df.epitope == "PA") | (tcrdist_clone_df.epitope == "F2")
    tcrdist_clone_df = tcrdist_clone_df[ind].copy()

    # Rename the clones-file columns to the tcrdist2 column names.
    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    tr.deduplicate()
    tr._tcrdist_legacy_method_alpha_beta()

    distA = tr.dist_a
    distB = tr.dist_b
    # The paired distance must be the sum of the per-chain distances.
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # Select the PA clones and the matching sub-blocks of both distance
    # frames (rows and columns are selected by clone_id).
    criteria = tr.clone_df.epitope == "PA"
    clone_df_subset = tr.clone_df[criteria]
    distA_subset = distA.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()
    distB_subset = distB.loc[clone_df_subset.clone_id,
                             clone_df_subset.clone_id].copy()

    ts = TCRsubset(clone_df_subset,
                   organism="mouse",
                   epitopes=["PA"],
                   epitope="PA",
                   chains=["A", "B"],
                   dist_a=distA_subset,
                   dist_b=distB_subset)

    # Motifs are loaded from a log file rather than by calling
    # ts.find_motif() (presumably to keep the test fast -- TODO confirm).
    cnames = [
        "file_type", "count", "expect_random", "expect_nextgen", "chi_squared",
        "nfixed", "showmotif", "num", "othernum", "overlap", "ep", "ab",
        "nseqs", "v_rep_counts", "j_rep_counts"
    ]
    motif_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones_cdr3_motifs_PA.log'
    # Context manager guarantees the log file handle is closed.
    with open(motif_fn, "r") as motif_file:
        motif_rows = [line.split() for line in motif_file]
    ts.motif_df = pd.DataFrame(motif_rows, columns=cnames)

    motif_list = list()
    motif_logo = list()
    for i, row in ts.motif_df.iterrows():
        StoreIOMotif_instance = ts.eval_motif(row)
        motif_list.append(StoreIOMotif_instance)
        motif_logo.append(
            plot_pwm(StoreIOMotif_instance,
                     create_file=False,
                     my_height=200,
                     my_width=600))
        # Only the first three motifs (i = 0, 1, 2) are evaluated.
        if i > 1:
            break
def test_Complete_Performance_Example():
    """
    End-to-end example on the full mouse clones file: compute legacy
    alpha/beta distances, check that dist_a + dist_b equals paired_tcrdist,
    then run a k-nearest-neighbour pass over the clones.

    For every clone, all clones from the same subject are held out, the
    remaining clones are ranked by paired distance, and the epitopes and
    distances of the k = 5 nearest ones are recorded together with the
    clone's own (observed) epitope.
    """
    import pandas as pd
    import numpy as np
    from collections import namedtuple
    from tcrdist import mappers
    from tcrdist.repertoire import TCRrep

    tcrdist_clone_fn = 'tcrdist/test_files/mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
    tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

    mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
    tcrdist2_df = mappers.generic_pandas_mapper(df=tcrdist_clone_df,
                                                mapping=mapping)

    tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")

    tr.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True)
    tr.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True)

    tr.index_cols = [
        'clone_id', 'subject', 'epitope', 'v_a_gene', 'j_a_gene', 'v_b_gene',
        'j_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'cdr1_a_aa', 'cdr2_a_aa',
        'pmhc_a_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'cdr3_b_nucseq',
        'cdr3_a_nucseq', 'va_countreps', 'ja_countreps', 'vb_countreps',
        'jb_countreps', 'va_gene', 'vb_gene', 'ja_gene', 'jb_gene'
    ]

    tr.deduplicate()
    tr._tcrdist_legacy_method_alpha_beta(processes=1)

    distA = tr.dist_a
    distB = tr.dist_b
    # The paired distance must be the sum of the per-chain distances.
    assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)

    # K NEAREST NEIGHBORS
    pr = namedtuple("perf", ["observed", "predicted", "dist"])
    k = 5
    subject_col = tr.clone_df.subject
    epitope_col = tr.clone_df.epitope
    observed = epitope_col.to_list()
    performance = list()

    # Positions (not index labels) are used throughout, because both the
    # `observed` list and tr.paired_tcrdist are indexed positionally.
    # NOTE(review): assumes tr.paired_tcrdist is a 2-D numpy array whose
    # row/column order matches tr.clone_df -- confirm.
    for i, subject in enumerate(subject_col):
        # Boolean mask holding out every clone from this clone's subject.
        held_out = (subject_col != subject).values

        # Distances from clone i to the remaining clones only.
        distances = tr.paired_tcrdist[i, held_out]

        # Positions within the held-out subset, nearest first.
        sorted_indices = np.argsort(distances)

        # The positions refer to the masked subset, so the epitopes must be
        # masked the same way before they are reordered; indexing the full
        # column would pick the wrong clones.
        sorted_epitopes = epitope_col[held_out].iloc[sorted_indices].to_list()
        sorted_distances = distances[sorted_indices]

        # Epitopes and distances of the k nearest neighbours.
        predicted = sorted_epitopes[0:k]
        predicted_distance = sorted_distances[0:k]

        performance.append(pr(observed[i], predicted, predicted_distance))

    # One performance record is produced per clone.
    assert len(performance) == len(observed)