示例#1
0
def get_TRBC(FASTA_DIR):
    """Load the TRBC sequences, keeping the first record seen for each gene.

    Reads ``TRBC.fasta`` from ``FASTA_DIR`` via ``Fasta.Fasta``. The gene name
    is taken from the second ``|``-separated header field with everything
    from the first ``*`` onwards removed, so later records whose header maps
    to an already-seen gene name are skipped.

    Args:
        FASTA_DIR: directory that contains ``TRBC.fasta``.

    Returns:
        dict mapping gene name to a dict with the keys ``"Sequence"``,
        ``"Gene"`` and ``"Length"`` taken from the first matching record.

    Raises:
        IndexError: if a record header contains no ``|`` separator.
    """
    trbc_fasta = "/".join([FASTA_DIR, "TRBC.fasta"])

    trb_c_dict = {}
    for record in Fasta.Fasta(trbc_fasta):
        # Only the gene field is needed; no other header field is indexed,
        # so short headers do not fail on a lookup whose value is unused.
        gene = record.header.split("|")[1].split("*")[0]

        # First record wins: the dict's own keys track which genes are done.
        if gene in trb_c_dict:
            continue

        trb_c_dict[gene] = {
            "Sequence": record.sequence,
            "Gene": gene,
            "Length": record.length,
        }
    return trb_c_dict
示例#2
0
def _peptide_entropy(peptide):
    """Return ``stats.entropy`` of the normalised ``counts`` of `peptide`."""
    record = Fasta.FastaRecordPeptide(header='', sequence=peptide)
    residue_counts = pd.Series(record.counts)
    return stats.entropy(residue_counts / residue_counts.sum())


def _cdr_features(peptide):
    """Return ``(entropies, sequences)`` for CDR1, CDR2 and CDR3 of `peptide`.

    Both results are 3-item lists. A region that could not be processed is
    reported as an empty sequence; a ``TypeError`` anywhere sets all three
    entropies to NaN.
    """
    # Defaults are set up front so every slot is always defined, whatever
    # path the error handling below takes.
    entropies = [np.nan] * 3
    sequences = [""] * 3
    try:
        for slot, region in enumerate((1, 2)):
            cdr_seq = get_CDR(peptide, region)
            entropies[slot] = _peptide_entropy(cdr_seq)
            # Recorded only after the region was processed successfully.
            sequences[slot] = cdr_seq
        try:
            cdr_seq = get_CDR(peptide, 3)
            entropies[2] = _peptide_entropy(cdr_seq)
            sequences[2] = cdr_seq
        except UnboundLocalError:
            # Kept from the original handling: this error during CDR3
            # extraction is treated as "no CDR3" rather than as a failure.
            entropies[2] = np.nan
            sequences[2] = ""
    except TypeError:
        # A TypeError invalidates every CDR entropy of the chain, including
        # those already computed for earlier regions.
        entropies = [np.nan] * 3
    return entropies, sequences


def _chain_fields(chain):
    """Return the 13 per-chain values of `chain` in output-column order."""
    chain_entropy = _peptide_entropy(chain['AA'])
    cdr_entropy, cdr_seq = _cdr_features(chain['AA'])
    return (
        chain['V'],
        chain['J'],
        len(chain['RNA']),
        chain_entropy,
        len(chain['AA']),
        chain['AA'],
        chain["Valid.Chain"],
        cdr_entropy[0],
        cdr_seq[0],
        cdr_entropy[1],
        cdr_seq[1],
        cdr_entropy[2],
        cdr_seq[2],
    )


def create_dataset(n_success, fasta_dir, outfile):
    '''
    Create a data set of `n_success` valid TCRs, record valid and failed chains

    TCRs are requested from ``generate_TCR(1, fasta_dir)`` until `n_success`
    of them have both chains flagged ``"Valid.Chain"``. Every generated TCR,
    valid or not, is written to `outfile` as one tab-separated row under a
    27-column header: 13 TRA columns, 13 TRB columns and ``Valid.TCR``.

    Each row carries exactly one value per header column. The raw RNA
    sequences are not written because the header has no column for them;
    only their lengths are (``TRA_len`` / ``TRB_len``).

    Args:
        n_success: number of fully valid TCRs to generate before stopping.
        fasta_dir: directory passed through to ``generate_TCR``.
        outfile: path of the tab-separated file to (over)write.
    '''

    valid_counter = 0

    # NOTE(review): nothing in this part of the function reads these lists
    # back. They are still filled, one entry per generated TCR, so that any
    # code relying on them keeps working.
    alpha_v = []
    beta_v = []

    alpha_len = []
    beta_len = []

    alpha_diversity = []
    beta_diversity = []

    alpha_j = []
    beta_j = []

    alpha_aa = []
    beta_aa = []

    alpha_pep = []
    beta_pep = []

    alpha_keep = []
    beta_keep = []

    alpha_cdr1 = []
    alpha_cdr2 = []
    alpha_cdr3 = []

    beta_cdr1 = []
    beta_cdr2 = []
    beta_cdr3 = []

    alpha_cdr1_diversity = []
    alpha_cdr2_diversity = []
    alpha_cdr3_diversity = []

    beta_cdr1_diversity = []
    beta_cdr2_diversity = []
    beta_cdr3_diversity = []

    valid_tcr_chains = []

    # Accumulators listed in the same order as the values from _chain_fields.
    alpha_columns = (alpha_v, alpha_j, alpha_len, alpha_diversity, alpha_aa,
                     alpha_pep, alpha_keep, alpha_cdr1_diversity, alpha_cdr1,
                     alpha_cdr2_diversity, alpha_cdr2, alpha_cdr3_diversity,
                     alpha_cdr3)
    beta_columns = (beta_v, beta_j, beta_len, beta_diversity, beta_aa,
                    beta_pep, beta_keep, beta_cdr1_diversity, beta_cdr1,
                    beta_cdr2_diversity, beta_cdr2, beta_cdr3_diversity,
                    beta_cdr3)

    with open(outfile, "wt") as ofile:
        ofile.write(
            "TRA_V\tTRA_J\tTRA_len\tTRA_entropy\tTRA_AA\tTRA_Peptide\tTRA_Valid\tTRA_CDR1_entropy\tTRA_CDR1\tTRA_CDR2_entropy\tTRA_CDR2\tTRA_CDR3_entropy\tTRA_CDR3\tTRB_V\tTRB_J\tTRB_len\tTRB_entropy\tTRB_AA\tTRB_Peptide\tTRB_Valid\tTRB_CDR1_entropy\tTRB_CDR1\tTRB_CDR2_entropy\tTRB_CDR2\tTRB_CDR3_entropy\tTRB_CDR3\tValid.TCR\n"
        )
        while valid_counter < n_success:
            tcr_chains = generate_TCR(1, fasta_dir)

            for x in tcr_chains:
                tcra = x['TCRA']
                tcrb = x['TCRB']

                # the entropy calculation should be for the CDR3 region or
                # the V(D)J region only; it is computed on the translated AA
                # sequence, not the nucleotide sequence
                alpha_fields = _chain_fields(tcra)
                for column, value in zip(alpha_columns, alpha_fields):
                    column.append(value)

                beta_fields = _chain_fields(tcrb)
                for column, value in zip(beta_columns, beta_fields):
                    column.append(value)

                # check for validity: both chains must be valid
                validchain = bool(tcra["Valid.Chain"] and tcrb["Valid.Chain"])
                if validchain:
                    valid_counter += 1
                valid_tcr_chains.append(validchain)

                # One value per header column, so rows and header line up.
                row = alpha_fields + beta_fields + (validchain,)
                ofile.write(
                    "\t".join("{}".format(value) for value in row) + "\n")