def set_fwr4_columns(record, database):
    """
    Fill in the FR4 columns of *record* and two mutation rates derived from them.

    The record is left untouched when it has no J call, when its locus is not in
    ALLOWED_LOCI, when the database reports no CDR3 end for the J gene, or when
    the record's "cdr3_end" is missing/falsy.

    Otherwise these keys are written (in place): "fwr4_start", "fwr4_end", "fwr4",
    "fwr4_aa", "FR4_SHM" and "J_aa_mut". Both rates are expressed in percent of
    the germline length.
    """
    j_name = record["j_call"]
    if not (j_name and record["locus"] in ALLOWED_LOCI):
        return

    ref_cdr3_end = database.j_cdr3_end(j_name, record["locus"])
    query_cdr3_end = record["cdr3_end"]
    if ref_cdr3_end is None or not query_cdr3_end:
        return

    # FR4 is taken to run from the end of the CDR3 to the end of the J alignment
    j_end = record["j_sequence_end"]
    fwr4_nt = record["sequence"][query_cdr3_end:j_end]

    # Existing values in these columns are replaced. "cdr3_end" is used directly
    # as the 0-based slice start, hence the + 1 for the reported start column.
    record["fwr4_start"] = query_cdr3_end + 1
    record["fwr4_end"] = j_end
    record["fwr4"] = fwr4_nt
    fwr4_aa = nt_to_aa(fwr4_nt)
    record["fwr4_aa"] = fwr4_aa

    # Nucleotide-level rate: edit distance between germline J and the FR4 query.
    # NOTE(review): the germline side is the whole aligned part of the J gene
    # (j_germline_start..j_germline_end), whereas the query side starts only
    # after the CDR3 end - confirm that this comparison is intended.
    j_reference = database.j[j_name]
    germline_nt = j_reference[record["j_germline_start"] - 1 : record["j_germline_end"]]
    nt_distance = edit_distance(germline_nt, fwr4_nt)
    record["FR4_SHM"] = 100.0 * nt_distance / len(germline_nt)

    # Amino-acid-level rate, computed on the translations of the same two sequences
    germline_aa = nt_to_aa(germline_nt)
    aa_distance = edit_distance(germline_aa, fwr4_aa)
    record["J_aa_mut"] = 100.0 * aa_distance / len(germline_aa)
def assert_cdr3_detection(chain, s):
    """
    Assert that find_cdr3() locates the expected CDR3 in every test sequence.

    split(s) must yield (amino_acids, sequence) pairs. Each sequence is checked
    three times: unmodified and with its first one or two characters removed.
    For every variant, find_cdr3() must return a match whose [0]:[1] slice
    translates to the expected amino acids.
    """
    for expected_aa, nucleotides in split(s):
        for n_trimmed in (0, 1, 2):
            shifted = nucleotides[n_trimmed:]
            hit = find_cdr3(shifted, chain)
            assert hit is not None
            found_aa = nt_to_aa(shifted[hit[0]:hit[1]])
            # The tuple identifies the failing case in the assertion message
            assert found_aa == expected_aa, (chain, expected_aa, n_trimmed)
def set_aa_mut_columns(record, database):
    """
    Store amino acid mutation rates (in percent) for the V regions in *record*.

    For each of FR1, CDR1, FR2, CDR2 and FR3 the key "<region>_aa_mut" is set,
    either to the rate or to None if no trustworthy rate could be computed.
    "V_aa_mut" is the rate over all five regions combined (CDR3 is not part of
    it) and is only set to a number if every single region yielded a rate;
    otherwise it is None.
    """
    regions_by_column = {
        "fwr1": "FR1",
        "cdr1": "CDR1",
        "fwr2": "FR2",
        "cdr2": "CDR2",
        "fwr3": "FR3",
    }
    distances = []
    lengths = []
    for airr_col, region in regions_by_column.items():
        rate_key = region + "_aa_mut"
        record[rate_key] = None

        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            continue

        # Coordinates are 1-based and inclusive, hence start - 1 for slicing
        query_aa = nt_to_aa(record["sequence"][start - 1 : end])
        reference_aa = database.v_regions_aa[record["v_call"]].get(region)

        # IgBLAST reports some FR1 alignments with a frameshift. Insisting on
        # equal query and reference lengths discards those cases and also
        # permits the faster Hamming distance.
        if reference_aa is None or len(reference_aa) != len(query_aa):
            continue

        reference_length = len(reference_aa)
        mismatches = hamming_distance(reference_aa, query_aa)
        fraction = mismatches / reference_length
        if fraction >= 0.8:
            # Implausibly high; assume something went wrong
            continue

        distances.append(mismatches)
        lengths.append(reference_length)
        record[rate_key] = 100.0 * fraction

    if len(distances) == len(regions_by_column):
        record["V_aa_mut"] = 100.0 * sum(distances) / sum(lengths)
    else:
        record["V_aa_mut"] = None
def set_cdr3_columns(record, database):
    """
    Locate the CDR3 within the query sequence and store it in *record*.

    On success the keys "cdr3_start", "cdr3_end", "cdr3" and "cdr3_aa" are set.
    The record is left unchanged if it lacks a V or J call, if its locus is not
    in ALLOWED_LOCI, if the database has no CDR3 start/end for the called
    genes, or if the CDR3 end cannot be mapped onto the query.
    """
    if not (record["v_call"] and record["j_call"] and record["locus"] in ALLOWED_LOCI):
        return

    # Start: map the reference CDR3 start on the V gene to a query position
    ref_start = database.v_cdr3_start(record["v_call"], record["locus"])
    if ref_start is None:
        return
    query_start = query_position(record, "v", reference_position=ref_start)
    if query_start is None:
        # The V alignment stops before the CDR3 start. Rescue the position by
        # extrapolating from the alignment end as if no indels followed.
        query_start = record["v_sequence_end"] + (
            ref_start - record["v_germline_end"]
        )

    # End: map the reference CDR3 end on the J gene; no rescue is attempted here
    ref_end = database.j_cdr3_end(record["j_call"], record["locus"])
    query_end = (
        None
        if ref_end is None
        else query_position(record, "j", reference_position=ref_end)
    )
    if query_end is None:
        return

    cdr3_nt = record["sequence"][query_start:query_end]
    # query_start is a 0-based slice index; the stored start column is one higher
    record["cdr3_start"] = query_start + 1
    record["cdr3_end"] = query_end
    record["cdr3"] = cdr3_nt
    record["cdr3_aa"] = nt_to_aa(cdr3_nt)