Пример #1
0
def test_get_bets_j_gene():
    """Check `TCRcodon.get_best_j_gene` against the `cdr3_d_aa` column of sant.csv.

    Two things are verified:

    1. The CDR3 in the second row of the file is assigned 'TRDJ1*01'.
    2. Running the method over every entry of `cdr3_d_aa` yields the
       expected number of calls per J gene.
    """
    tc = TCRcodon(organism="human", db_file="gammadelta_db.tsv")
    df = pd.read_csv("tcrdist/test_files_compact/sant.csv")

    # Single-sequence check (second row; read_csv yields a default 0..n-1
    # index, so positional and label access coincide here).
    single_cdr3 = df['cdr3_d_aa'].iloc[1]
    best_j = tc.get_best_j_gene(aa_seq=single_cdr3, verbose=True)
    assert best_j == 'TRDJ1*01'

    # Whole-column check: tally how often each J gene is selected.
    all_best_j = [
        tc.get_best_j_gene(aa_seq=cdr3, verbose=False)
        for cdr3 in df['cdr3_d_aa']
    ]
    gene_counts = pd.Series(all_best_j).value_counts().to_dict()
    assert gene_counts == {
        'TRDJ1*01': 271,
        'TRDJ3*01': 66,
        'TRDJ2*01': 20,
        'TRDJ4*01': 9,
    }
Пример #2
0
def test_get_bets_j_gene_room_for_improvement():
    """Loosely compare `get_best_j_gene` calls with the recorded `j_b_gene` values.

    The method is run on every `cdr3_b_aa` entry of `clone_df_subset` using
    the mouse alpha/beta database. Only the first two calls are required to
    match exactly; overall, more than 20 matches are required.
    """
    codon = TCRcodon(organism="mouse", db_file="alphabeta_db.tsv")
    beta_columns = ['v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'cdr3_b_nucseq']
    beta_df = clone_df_subset[beta_columns]

    predicted_j = []
    for cdr3 in beta_df['cdr3_b_aa']:
        predicted_j.append(codon.get_best_j_gene(aa_seq=cdr3, verbose=False))

    recorded_j = beta_df['j_b_gene']

    # Known limitation: the method cannot resolve ties (and possibly other
    # cases), so full agreement with the recorded genes is not expected.
    first_two_agree = np.all(recorded_j[0:2] == predicted_j[0:2])
    assert first_two_agree

    n_matches = np.sum(recorded_j == predicted_j)
    assert n_matches > 20
Пример #3
0
def test_TCRcodon_smal_dataframe_gama_lots():
    """Check that `guess_reverse_translation` returns sequences of the right length.

    For every row of sant.csv that has a non-missing `v_d_gene`, a J gene is
    first assigned with `get_best_j_gene`, then a nucleotide sequence is
    guessed from (`v_d_gene`, `j_d_gene`, `cdr3_d_aa`). Each guessed sequence
    must be exactly three times as long as its amino-acid CDR3.
    """
    tc = TCRcodon(organism="human", db_file="gammadelta_db.tsv")
    df = pd.read_csv("tcrdist/test_files_compact/sant.csv")
    df['j_d_gene'] = [
        tc.get_best_j_gene(aa_seq=x, verbose=False) for x in df['cdr3_d_aa']
    ]
    # Only rows with a V gene are passed to the reverse translation.
    df = df[df['v_d_gene'].notna()].copy()
    syn_nucs = df.apply(
        lambda r: tc.guess_reverse_translation(
            r['v_d_gene'], r['j_d_gene'], r['cdr3_d_aa'], verbose=False
        ),
        axis=1,
    )

    # Check that synthetic and real sequences are the same length.
    len_syn = [len(x) for x in syn_nucs]
    len_real = [3 * len(x) for x in df['cdr3_d_aa']]
    # Both are plain lists, so `==` already compares them as a whole;
    # wrapping the result in np.all added nothing.
    assert len_syn == len_real