def _organize_calls(out_file, hla_base, data):
    """Collect per-locus HLA genotype calls into a single CSV summary.

    Every file matching ``<hla_base>.HLA-*.gt`` is read as tab-separated text.
    The first line of a file supplies the reported call (its two alleles, their
    p-groups and the mismatch field); the number of distinct p-group pairs seen
    across all lines is reported as the "options" count. A genotype file with
    no lines contributes no row.

    The CSV is written inside a ``file_transaction`` context and ``out_file``
    is returned.
    """
    truthset = get_hla_truthset(data)
    align_file = dd.get_align_bam(data)
    sample = dd.get_sample_name(data)
    header = ["sample", "locus", "mismatches", "options", "alleles", "p-groups", "expected", "validates"]
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            for gt_path in glob.glob("%s.HLA-*.gt" % (hla_base)):
                # The locus is whatever remains of the file name once the
                # alignment-derived prefix and the extension are removed.
                gt_name = os.path.basename(gt_path)
                locus_prefix = "%s.hla.HLA-" % os.path.basename(align_file)
                locus = gt_name.replace(locus_prefix, "").replace(".gt", "")
                with open(gt_path) as in_handle:
                    first_call = None
                    seen_pgroups = set()
                    for line in in_handle:
                        _, allele_a, allele_b, mismatch_field = line.split("\t")[:4]
                        pair = (hla_groups.hla_protein(allele_a, data),
                                hla_groups.hla_protein(allele_b, data))
                        # Only the first line is kept as the reported call.
                        if first_call is None:
                            first_call = ([allele_a, allele_b], pair, mismatch_field)
                        seen_pgroups.add(pair)
                    if not seen_pgroups:
                        continue
                    call_alleles, call_pgroups, mismatches = first_call
                    truth_alleles = tz.get_in([sample, locus], truthset, [])
                    writer.writerow([sample, locus, mismatches, len(seen_pgroups),
                                     ";".join(call_alleles), ";".join(call_pgroups),
                                     ";".join(truth_alleles),
                                     _matches_truth(call_alleles, truth_alleles, data)])
    return out_file
def _organize_calls(out_file, hla_base, data):
    """Write a CSV report of the top HLA genotype call for each locus.

    Genotype files are located with the glob ``<hla_base>.HLA-*.gt``; the locus
    name is the file's base name with the ``<basename(hla_base)>.HLA-`` prefix
    and the ``.gt`` suffix removed. For each file the first tab-separated line
    provides the reported alleles, p-groups and mismatch field, while the
    number of distinct p-group pairs over all lines is reported as "options".
    Files without any lines are skipped.

    Output goes through a ``file_transaction`` context; ``out_file`` is
    returned.
    """
    hla_truth = get_hla_truthset(data)
    sample = dd.get_sample_name(data)
    columns = ["sample", "locus", "mismatches", "options", "alleles", "p-groups", "expected", "validates"]
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(columns)
            for genotype_file in glob.glob("%s.HLA-*.gt" % (hla_base)):
                file_name = os.path.basename(genotype_file)
                prefix = "%s.HLA-" % os.path.basename(hla_base)
                hla_locus = file_name.replace(prefix, "").replace(".gt", "")
                with open(genotype_file) as in_handle:
                    pgroup_pairs = set()
                    for line_index, line in enumerate(in_handle):
                        _, first, second, mismatch_field = line.split("\t")[:4]
                        current = (hla_groups.hla_protein(first, data),
                                   hla_groups.hla_protein(second, data))
                        pgroup_pairs.add(current)
                        # The leading line of the file is the one reported.
                        if line_index == 0:
                            top_alleles = [first, second]
                            top_pgroups = current
                            top_mismatches = mismatch_field
                    if pgroup_pairs:
                        expected = tz.get_in([sample, hla_locus], hla_truth, [])
                        row = [sample, hla_locus, top_mismatches, len(pgroup_pairs),
                               ";".join(top_alleles), ";".join(top_pgroups),
                               ";".join(expected),
                               matches_truth(top_alleles, expected, data)]
                        writer.writerow(row)
    return out_file
def _matches_truth(call_alleles, truth_alleles, data): """Flexibly check if truth and call alleles match, using p-groups. """ if not truth_alleles: return "" else: call_pgroups = [hla_groups.hla_protein(x, data) for x in call_alleles] t_cmp = set([hla_groups.hla_protein(x, data) for x in truth_alleles]) c_cmp = set(call_pgroups + [x[:-1] for x in call_pgroups if x.endswith("P")]) return "yes" if len(t_cmp.intersection(c_cmp)) == len(set(call_pgroups)) else "no"
def matches_truth(call_alleles, truth_alleles, data):
    """Report whether every truth p-group is present among the called p-groups.

    Returns an empty string when no truth alleles are supplied. Otherwise each
    allele is converted with ``hla_groups.hla_protein`` and a trailing "P" is
    dropped from the result before comparison. Returns "yes" when all of the
    normalized truth p-groups occur in the normalized call p-groups, "no"
    otherwise.
    """
    if not truth_alleles:
        return ""

    def _normalized_pgroup(allele):
        pgroup = hla_groups.hla_protein(allele, data)
        if pgroup.endswith("P"):
            return pgroup[:-1]
        return pgroup

    expected = {_normalized_pgroup(allele) for allele in truth_alleles}
    observed = {_normalized_pgroup(allele) for allele in call_alleles}
    # Equivalent to the intersection being as large as the truth set.
    return "yes" if expected.issubset(observed) else "no"
def _matches_truth(call_alleles, truth_alleles, data): """Flexibly check if truth and call alleles match, using p-groups. """ if not truth_alleles: return "" else: call_pgroups = [hla_groups.hla_protein(x, data) for x in call_alleles] t_cmp = set([hla_groups.hla_protein(x, data) for x in truth_alleles]) c_cmp = set(call_pgroups + [x[:-1] for x in call_pgroups if x.endswith("P")]) return "yes" if len(t_cmp.intersection(c_cmp)) == len( set(call_pgroups)) else "no"