Exemplo n.º 1
0
def test_mixcr_to_tcrdist_on_clones():
    """Run a MiXCR clones fixture through conversion, filtering and deduplication.

    The fixture is parsed as human delta chain, rows with an invalid V gene
    are removed, and the result is loaded into a ``TCRrep`` that infers CDRs
    and deduplicates. The test checks that each stage yields a DataFrame.
    """
    test_clones = os.path.join('tcrdist', 'test_files_compact',
                               'SRR5130260.1.test.fastq.output.clns.txt')
    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 clones_fn=test_clones)
    assert isinstance(df, pd.DataFrame)

    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    # Check the filtered frame itself; the original re-checked `df` here,
    # which left the output of the filter step unverified.
    assert isinstance(df1, pd.DataFrame)
    # 'subject' is listed in index_cols below, so the column must exist.
    df1['subject'] = 'SRR5130260.1'

    tr = TCRrep(cell_df=df1,
                organism="human",
                chains=['delta'],
                db_file='gammadelta_db.tsv')
    print(tr.cell_df.shape[0])

    tr.infer_cdrs_from_v_gene(chain='delta', imgt_aligned=True)

    tr.index_cols = [
        'subject', "v_d_gene", 'd_d_gene', 'j_d_gene', 'cdr3_d_nucseq',
        'cdr3_d_aa', 'cdr1_d_aa', 'cdr2_d_aa', 'pmhc_d_aa'
    ]

    tr.deduplicate()
    assert isinstance(tr.clone_df, pd.DataFrame)
Exemplo n.º 2
0
def test_mixcr_integration_with_wrong_chain():
    """Parsing the clones fixture as gamma chain must leave no valid rows.

    The fixture is converted with ``chain="gamma"`` and then filtered for
    invalid V genes; the filtered frame is expected to be empty.
    NOTE(review): presumably the fixture holds no gamma-chain clones -
    confirm against the fixture file.
    """
    clones_path = os.path.join(
        'tcrdist',
        'test_files_compact',
        'SRR5130260.1.test.fastq.output.clns.txt',
    )
    # Both calls take the same chain/organism pair.
    chain_kwargs = {'chain': "gamma", 'organism': "human"}

    converted = mixcr.mixcr_to_tcrdist2(seqs_fn=None,
                                        clones_fn=clones_path,
                                        **chain_kwargs)
    surviving = mixcr.remove_entries_with_invalid_vgene(converted,
                                                        **chain_kwargs)

    n_surviving = surviving.shape[0]
    assert n_surviving == 0
Exemplo n.º 3
0
def test_mixcr_integration_with_correct_chain():
    """Parsing the clones fixture as delta chain keeps the expected rows.

    The fixture is converted with ``chain="delta"`` and filtered for invalid
    V genes; both stages must yield a DataFrame and 89 rows must survive.
    """
    test_clones_fn = os.path.join('tcrdist', 'test_files_compact',
                                  'SRR5130260.1.test.fastq.output.clns.txt')

    df = mixcr.mixcr_to_tcrdist2(chain="delta",
                                 organism="human",
                                 seqs_fn=None,
                                 clones_fn=test_clones_fn)
    assert isinstance(df, pd.DataFrame)

    df1 = mixcr.remove_entries_with_invalid_vgene(df,
                                                  chain="delta",
                                                  organism="human")
    # Check the filtered frame itself; the original re-checked `df` here,
    # which left the output of the filter step unverified.
    assert isinstance(df1, pd.DataFrame)
    assert df1.shape[0] == 89
Exemplo n.º 4
0
def test_convert_minervina_to_mixcr_run_tcrdist(f, my_chain):
    """Convert a clone table to faux MiXCR output and run the legacy tcrdist.

    The table is renamed to MiXCR column names, written to disk, re-imported
    through ``mixcr.mixcr_to_tcrdist2``, filtered, loaded into a ``TCRrep``,
    deduplicated, and passed to the chain-specific legacy distance method.

    Parameters
    ----------
    f : str
        File name of a tab-separated clone table under ``tcrdist/test_files``.
    my_chain : str
        Either ``"alpha"`` or ``"beta"``.

    Raises
    ------
    ValueError
        If ``my_chain`` is not ``"alpha"`` or ``"beta"``. The original code
        failed later with an unbound ``tr`` in that case.
    """
    import tempfile

    # Single-letter tag that appears in the chain-specific column names.
    chain_tags = {"alpha": "a", "beta": "b"}
    if my_chain not in chain_tags:
        raise ValueError(
            "my_chain must be 'alpha' or 'beta', got {!r}".format(my_chain))
    tag = chain_tags[my_chain]

    fn = os.path.join('tcrdist', 'test_files', f)
    df = pd.read_csv(fn, sep="\t")
    # Append a "*00" suffix to every V and J gene name before conversion.
    df['bestVGene'] = df['bestVGene'].apply(lambda s: s + "*00")
    df['bestJGene'] = df['bestJGene'].apply(lambda s: s + "*00")

    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }
    df = df.rename(columns=map_minervina_to_mixcr)

    # Write the faux MiXCR output into a private temporary directory so the
    # test neither litters the working directory nor collides with other
    # tests that use the same file name.
    with tempfile.TemporaryDirectory() as tmp_dir:
        faux_clones_fn = os.path.join(tmp_dir, 'dfmix.clns.txt')
        df.to_csv(faux_clones_fn, index=False, sep="\t")
        dfmix = mixcr.mixcr_to_tcrdist2(chain=my_chain,
                                        organism="human",
                                        clones_fn=faux_clones_fn)

    # The expected column set is only pinned for the alpha chain.
    if my_chain == "alpha":
        assert set(dfmix.columns.to_list()) == {
            'clone_id', 'count', 'v_a_gene', 'd_a_gene', 'j_a_gene',
            'cdr3_a_nucseq', 'cdr3_a_aa'
        }

    # Conversion must not drop or add rows.
    assert df.shape[0] == dfmix.shape[0]
    dfmix = mixcr.remove_entries_with_invalid_vgene(dfmix,
                                                    chain=my_chain,
                                                    organism="human")
    dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix, chain=my_chain)

    tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
    tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)
    # 'subject' and 'epitope' are listed in index_cols below, so fill them
    # with a placeholder value.
    tr.cell_df['subject'] = 'X'
    tr.cell_df['epitope'] = 'X'

    index_col_templates = (
        'clone_id', 'subject', 'epitope', 'v_{}_gene', 'j_{}_gene',
        'cdr3_{}_aa', 'cdr1_{}_aa', 'cdr2_{}_aa', 'pmhc_{}_aa',
        'cdr3_{}_nucseq',
    )
    tr.index_cols = [col.format(tag) for col in index_col_templates]

    tr.deduplicate()

    # Resolves to _tcrdist_legacy_method_alpha or _tcrdist_legacy_method_beta.
    getattr(tr, '_tcrdist_legacy_method_{}'.format(my_chain))()
    assert isinstance(getattr(tr, 'cdr3_{}_aa_pw'.format(tag)), np.ndarray)
    assert isinstance(tr.paired_tcrdist, np.ndarray)
Exemplo n.º 5
0
def test_combine_betas_and_alphas():
    """Pool several clone tables per chain and run the legacy tcrdist on each.

    Each of eight tab-separated clone tables is renamed to MiXCR column
    names, written to disk, re-imported through ``mixcr.mixcr_to_tcrdist2``,
    filtered, and annotated with its source file, subject and trajectory.
    The alpha and beta tables are then concatenated separately, loaded into
    a ``TCRrep``, deduplicated, and passed to the chain-specific legacy
    distance method.

    Returns
    -------
    dict
        Maps ``"alpha"`` and ``"beta"`` to the corresponding ``TCRrep``.
        The return value is kept so existing callers that reuse the result
        keep working.
    """
    import os
    import tempfile
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist import mixcr
    import multiprocessing

    # (file name, chain, subject, trajectory)
    testfiles = [
        ('contracting_clones_M_alpha.tsv', 'alpha', 'M', 'contracting'),
        ('contracting_clones_M_beta.tsv', 'beta', 'M', 'contracting'),
        ('contracting_clones_W_alpha.tsv', 'alpha', 'W', 'contracting'),
        ('contracting_clones_W_beta.tsv', 'beta', 'W', 'contracting'),
        ('expanding_clones_M_alpha.tsv', 'alpha', 'M', 'expanding'),
        ('expanding_clones_M_beta.tsv', 'beta', 'M', 'expanding'),
        ('expanding_clones_W_alpha.tsv', 'alpha', 'W', 'expanding'),
        ('expanding_clones_W_beta.tsv', 'beta', 'W', 'expanding')
    ]

    # The column map does not change between files, so build it once.
    map_minervina_to_mixcr = {
        'Rank': 'cloneId',
        'Read.count': 'cloneCount',
        'Read.proportion': 'cloneFraction',
        'bestVGene': 'allVHitsWithScore',
        'bestDGene': 'allDHitsWithScore',
        'bestJGene': 'allJHitsWithScore',
        'CDR3.nucleotide.sequence': 'nSeqCDR3',
        'CDR3.amino.acid.sequence': 'aaSeqCDR3',
        'refPoints': 'refPoints',
    }

    # Single-letter tag that appears in the chain-specific column names.
    chain_tags = {'alpha': 'a', 'beta': 'b'}
    per_chain = {'alpha': [], 'beta': []}

    # Write the faux MiXCR output into a private temporary directory so the
    # test neither litters the working directory nor collides with other
    # tests that use the same file name.
    with tempfile.TemporaryDirectory() as tmp_dir:
        faux_clones_fn = os.path.join(tmp_dir, 'dfmix.clns.txt')

        for f, my_chain, sub, group in testfiles:
            fn = os.path.join('tcrdist', 'test_files', f)
            df = pd.read_csv(fn, sep="\t")
            # Append a "*00" suffix to every V and J gene name.
            df['bestVGene'] = df['bestVGene'].apply(lambda s: s + "*00")
            df['bestJGene'] = df['bestJGene'].apply(lambda s: s + "*00")
            print(df.columns)

            df = df.rename(columns=map_minervina_to_mixcr)
            print(df.columns)
            df.to_csv(faux_clones_fn, index=False, sep="\t")
            dfmix = mixcr.mixcr_to_tcrdist2(chain=my_chain,
                                            organism="human",
                                            clones_fn=faux_clones_fn)
            # Carry over two columns from the renamed input table.
            dfmix['CD'] = df['CD'].copy()
            dfmix['proportion'] = df['cloneFraction'].copy()
            dfmix = mixcr.remove_entries_with_invalid_vgene(
                dfmix, chain=my_chain, organism="human")
            dfmix = mixcr.remove_entries_with_invalid_cdr3(dfmix,
                                                           chain=my_chain)
            dfmix['source'] = f
            dfmix['subject'] = sub
            dfmix['epitope'] = 'X'
            dfmix['trajectory'] = group

            per_chain[my_chain].append(dfmix)

    betas_joined = pd.concat(per_chain['beta'])
    alpha_joined = pd.concat(per_chain['alpha'])

    testsets = [(alpha_joined, 'alpha'), (betas_joined, 'beta')]

    # Use at most 10 worker processes, and never more than the CPU count.
    my_processes = min(10, multiprocessing.cpu_count())

    index_col_templates = (
        'clone_id', 'subject', 'epitope', 'trajectory', 'v_{}_gene',
        'j_{}_gene', 'cdr3_{}_aa', 'cdr1_{}_aa', 'cdr2_{}_aa', 'pmhc_{}_aa',
        'cdr3_{}_nucseq', 'CD', 'source',
    )

    tcr_rep_results = dict()
    for dfmix, my_chain in testsets:
        tag = chain_tags[my_chain]

        tr = TCRrep(cell_df=dfmix, organism="human", chains=[my_chain])
        tr.infer_cdrs_from_v_gene(chain=my_chain, imgt_aligned=True)
        tr.index_cols = [col.format(tag) for col in index_col_templates]
        tr.deduplicate()

        # Resolves to _tcrdist_legacy_method_alpha or
        # _tcrdist_legacy_method_beta.
        legacy_method = getattr(
            tr, '_tcrdist_legacy_method_{}'.format(my_chain))
        legacy_method(processes=my_processes)
        assert isinstance(getattr(tr, 'cdr3_{}_aa_pw'.format(tag)),
                          np.ndarray)
        assert isinstance(tr.paired_tcrdist, np.ndarray)

        tcr_rep_results[my_chain] = tr
    return tcr_rep_results