Exemplo n.º 1
0
def _organize_calls(out_file, hla_base, data):
    """Prepare genotype calls, reporting best call along with quality metrics.

    Collects every ``<hla_base>.HLA-*.gt`` genotype file and writes one CSV row
    per locus to ``out_file``. The first non-blank line of each genotype file
    supplies the reported alleles and mismatch count; the ``options`` column is
    the number of distinct p-group pairs seen across all lines of that file.

    Args:
        out_file: Path of the CSV summary to create (written through
            ``file_transaction``).
        hla_base: Path prefix shared by the per-locus ``.gt`` files.
        data: Sample data object, passed through to the helper functions.

    Returns:
        ``out_file``.
    """
    hla_truth = get_hla_truthset(data)
    sample = dd.get_sample_name(data)
    # Build the prefix from the same base the glob matches on, so the locus
    # name is extracted correctly whatever the alignment file is called.
    locus_prefix = "%s.HLA-" % os.path.basename(hla_base)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(["sample", "locus", "mismatches", "options", "alleles", "p-groups", "expected",
                             "validates"])
            # glob yields files in arbitrary order; sort for reproducible output.
            for genotype_file in sorted(glob.glob("%s.HLA-*.gt" % (hla_base))):
                hla_locus = os.path.basename(genotype_file).replace(locus_prefix, "").replace(".gt", "")
                best_call = None
                total_options = set()
                with open(genotype_file) as in_handle:
                    for line in in_handle:
                        # Drop the line terminator so a 4-column line does not
                        # leak "\n" into the mismatch field.
                        line = line.rstrip("\r\n")
                        if not line.strip():
                            continue
                        _, aone, atwo, m = line.split("\t")[:4]
                        pgroups = (hla_groups.hla_protein(aone, data), hla_groups.hla_protein(atwo, data))
                        if best_call is None:
                            best_call = ([aone, atwo], pgroups, m)
                        total_options.add(pgroups)
                # Files without any genotype line produce no row.
                if best_call is not None:
                    call_alleles, call_pgroups, mismatches = best_call
                    truth_alleles = tz.get_in([sample, hla_locus], hla_truth, [])
                    writer.writerow([sample, hla_locus, mismatches, len(total_options),
                                     ";".join(call_alleles), ";".join(call_pgroups),
                                     ";".join(truth_alleles), _matches_truth(call_alleles, truth_alleles, data)])
    return out_file
Exemplo n.º 2
0
def _organize_calls(out_file, hla_base, data):
    """Prepare genotype calls, reporting best call along with quality metrics.

    Collects every ``<hla_base>.HLA-*.gt`` genotype file and writes one CSV row
    per locus to ``out_file``. The first non-blank line of each genotype file
    supplies the reported alleles and mismatch count; the ``options`` column is
    the number of distinct p-group pairs seen across all lines of that file.

    Args:
        out_file: Path of the CSV summary to create (written through
            ``file_transaction``).
        hla_base: Path prefix shared by the per-locus ``.gt`` files.
        data: Sample data object, passed through to the helper functions.

    Returns:
        ``out_file``.
    """
    hla_truth = get_hla_truthset(data)
    sample = dd.get_sample_name(data)
    locus_prefix = "%s.HLA-" % os.path.basename(hla_base)
    header = ["sample", "locus", "mismatches", "options", "alleles", "p-groups", "expected",
              "validates"]
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            # glob yields files in arbitrary order; sort for reproducible output.
            for genotype_file in sorted(glob.glob("%s.HLA-*.gt" % (hla_base))):
                hla_locus = os.path.basename(genotype_file).replace(locus_prefix, "").replace(".gt", "")
                best_call = None
                total_options = set()
                with open(genotype_file) as in_handle:
                    for line in in_handle:
                        # Drop the line terminator so a 4-column line does not
                        # leak "\n" into the mismatch field.
                        line = line.rstrip("\r\n")
                        if not line.strip():
                            continue
                        _, aone, atwo, m = line.split("\t")[:4]
                        pgroups = (hla_groups.hla_protein(aone, data), hla_groups.hla_protein(atwo, data))
                        if best_call is None:
                            best_call = ([aone, atwo], pgroups, m)
                        total_options.add(pgroups)
                # Files without any genotype line produce no row.
                if best_call is not None:
                    call_alleles, call_pgroups, mismatches = best_call
                    truth_alleles = tz.get_in([sample, hla_locus], hla_truth, [])
                    writer.writerow([sample, hla_locus, mismatches, len(total_options),
                                     ";".join(call_alleles), ";".join(call_pgroups),
                                     ";".join(truth_alleles), matches_truth(call_alleles, truth_alleles, data)])
    return out_file
Exemplo n.º 3
0
def _matches_truth(call_alleles, truth_alleles, data):
    """Flexibly check if truth and call alleles match, using p-groups.
    """
    if not truth_alleles:
        return ""
    else:
        call_pgroups = [hla_groups.hla_protein(x, data) for x in call_alleles]
        t_cmp = set([hla_groups.hla_protein(x, data) for x in truth_alleles])
        c_cmp = set(call_pgroups + [x[:-1] for x in call_pgroups if x.endswith("P")])
        return "yes" if len(t_cmp.intersection(c_cmp)) == len(set(call_pgroups)) else "no"
Exemplo n.º 4
0
def matches_truth(call_alleles, truth_alleles, data):
    """Flexibly check if truth and call alleles match, using p-groups.

    Each allele is converted with ``hla_groups.hla_protein`` and a trailing
    ``P`` is removed before comparing.

    Returns ``""`` when no truth alleles are supplied, ``"yes"`` when every
    distinct truth group also appears among the called groups, and ``"no"``
    otherwise.
    """
    if not truth_alleles:
        return ""

    def _protein_groups(alleles):
        groups = set()
        for allele in alleles:
            group = hla_groups.hla_protein(allele, data)
            if group.endswith("P"):
                group = group[:-1]
            groups.add(group)
        return groups

    expected = _protein_groups(truth_alleles)
    called = _protein_groups(call_alleles)
    if expected.issubset(called):
        return "yes"
    return "no"
Exemplo n.º 5
0
def _matches_truth(call_alleles, truth_alleles, data):
    """Flexibly check if truth and call alleles match, using p-groups.
    """
    if not truth_alleles:
        return ""
    else:
        call_pgroups = [hla_groups.hla_protein(x, data) for x in call_alleles]
        t_cmp = set([hla_groups.hla_protein(x, data) for x in truth_alleles])
        c_cmp = set(call_pgroups +
                    [x[:-1] for x in call_pgroups if x.endswith("P")])
        return "yes" if len(t_cmp.intersection(c_cmp)) == len(
            set(call_pgroups)) else "no"