示例#1
0
def sort_length(b1, b2):
    """
    Comparison function that orders entries from largest to smallest size.

    Follows the old-style cmp contract: a negative value means b1 sorts
    before b2, a positive value means it sorts after, 0 means they tie.
    Entries of equal size are ordered by their REF string, the greater REF
    sorting first.
    """
    size1 = truvari.entry_size(b1)
    size2 = truvari.entry_size(b2)
    # bool - bool gives -1, 0 or 1; the larger entry has to compare as "less"
    by_size = (size1 < size2) - (size1 > size2)
    if by_size:
        return by_size
    return (b1.ref < b2.ref) - (b1.ref > b2.ref)
示例#2
0
    def filter_call(self, entry, base=False):
        """
        Decide whether a call has to be excluded.

        Base calls and comparison calls use different minimum sizes and
        different samples, so `base` tells the method which kind of entry it
        was handed.  Returns True when the entry should be filtered and False
        when it should be kept.
        """
        params = self.params
        size = truvari.entry_size(entry)
        if size > params.sizemax:
            return True

        # base calls are held to sizemin, comparison calls to sizefilt
        smallest = params.sizemin if base else params.sizefilt
        if size < smallest:
            return True

        if base:
            sample, tag = params.bSample, 'b'
        else:
            sample, tag = params.cSample, 'c'
        # no_ref == "a" applies to both kinds of call, otherwise it has to
        # name the kind being checked
        if params.no_ref in ("a", tag) and not truvari.entry_is_present(
                entry, sample):
            return True

        return bool(params.passonly and truvari.entry_is_filtered(entry))
示例#3
0
    def run(self):
        """
        Walk the input vcf and group its entries into neighborhoods.

        The input has to be position sorted; the first out-of-order entry is
        logged and the program exits with status 1.  Entries that are too
        small (or filtered, when passonly is set) are written straight to the
        output, all others are handed to flush_push_stack as a new range.
        """
        previous = None
        for entry in self.in_vcf:
            size = truvari.entry_size(entry)
            chrom, pos = entry.chrom, entry.start
            if previous is None:
                previous = (chrom, pos)
            prev_chrom, prev_pos = previous

            if chrom == prev_chrom:
                if prev_pos > pos:
                    logging.error("File is not sorted %s:%d before %s:%d",
                                  prev_chrom, prev_pos, chrom, pos)
                    sys.exit(1)
            else:
                # A new chromosome closes whatever is still pending
                self.chrom_end_flush()
                self.neigh_id += 1

            previous = (chrom, pos)

            passthrough = size < self.sizemin or (
                self.passonly and truvari.filter_value(entry))
            if passthrough:
                self.out_vcf.write(entry)
                continue

            # Range layout is [start, end, entry, 0]
            start, end = truvari.entry_boundaries(entry)
            self.flush_push_stack([start, end, entry, 0])

        self.chrom_end_flush()
示例#4
0
def process_entries(ref_section):
    """
    Annotate the vcf entries that start inside one reference section.

    :param `ref_section`: (chrom, start, stop) of the region to process
    :type `ref_section`: tuple

    :return: (chrom, start, stop, text) where text is the concatenated string
        form of every entry starting in the region, or "" when no entry in
        the region reaches the minimum length
    :rtype: tuple
    """
    chrom, start, stop = ref_section
    logging.debug(f"Starting region {chrom}:{start}-{stop}")
    setproctitle(f"trf {chrom}:{start}-{stop}")

    def starts_in_section(entry):
        # Prevent duplication: only entries starting inside the section are
        # handled here
        return start <= entry.start < stop

    def long_enough(entry):
        return truvari.entry_size(entry) >= trfshared.args.min_length

    # The handle is only needed while collecting, so close it right after
    with pysam.VariantFile(trfshared.args.input) as vcf:
        to_consider = [
            entry for entry in vcf.fetch(chrom, start, stop)
            if starts_in_section(entry) and long_enough(entry)
        ]

    if not to_consider:
        return (chrom, start, stop, "")

    tanno = TRFAnno(executable=trfshared.args.executable,
                    trf_params=trfshared.args.trf_params)
    tanno.run_trf(to_consider)

    out = StringIO()
    with pysam.VariantFile(trfshared.args.input) as v:
        new_header = edit_header(v.header)
        # NOTE(review): this changes the precision of the current decimal
        # context for everything that runs afterwards - confirm that only
        # the annotation step is meant to be affected
        decimal.getcontext().prec = 1
        for entry in v.fetch(chrom, start, stop):
            if not starts_in_section(entry):
                continue
            if long_enough(entry):
                key = f"{entry.chrom}:{entry.start}-{entry.stop}.{hash(entry.alts[0])}"
                entry = tanno.annotate(entry, key, new_header)
            out.write(str(entry))
    setproctitle(f"trf done {chrom}:{start}-{stop}")
    logging.debug(f"Done region {chrom}:{start}-{stop}")
    return (chrom, start, stop, out.getvalue())
示例#5
0
 def annotate_entry(self, entry):
     """
     Add the remap annotations to a single entry and return it.

     Entries smaller than min_length are returned untouched.  Larger ones
     are translated to the new header, get their REMAP value set and, when
     anno_hits is set and hits were found, REMAPHits filled from the last
     anno_hits hits.
     """
     if truvari.entry_size(entry) < self.min_length:
         return entry

     entry.translate(self.n_header)
     remap, hits = self.remap_entry(entry)
     entry.info["REMAP"] = remap
     keep = self.anno_hits
     if keep and hits:
         entry.info["REMAPHits"] = [hit[1] for hit in hits[-keep:]]
     return entry
示例#6
0
 def annotate_entry(self, entry, hits):
     """
     Pick the hit covering the largest share of the entry and annotate with it.

     Only hits whose covered fraction reaches the threshold qualify; on equal
     coverage the earliest hit wins.  edit_entry receives None as the hit
     when nothing qualifies.
     """
     total = truvari.entry_size(entry)
     top_hit = None
     top_pct = 0
     for candidate in hits:
         aligned = abs(candidate["RM_qstart"] - candidate["RM_qend"]) + 1
         coverage = aligned / total
         # Only the single best hit is kept, so the entry may end up
         # 'under annotated'; the hit score is not considered
         if coverage < self.threshold or coverage <= top_pct:
             continue
         top_pct, top_hit = coverage, candidate
     return self.edit_entry(entry, top_hit)
示例#7
0
def svinfo_main(cmdargs):
    """
    Main method: add SVTYPE and SVLEN INFO fields to a vcf.

    Every entry of the input is written to the output; only entries of at
    least args.minsize are translated to the new header and annotated.

    :param `cmdargs`: command line arguments handed to parse_args
    """
    args = parse_args(cmdargs)
    # Both files are context managed so the handles are closed even when an
    # entry fails part way through
    with pysam.VariantFile(args.input) as vcf:
        n_header = edit_header(vcf)
        with pysam.VariantFile(args.output, 'w', header=n_header) as out:
            for entry in vcf:
                sz = truvari.entry_size(entry)
                if sz >= args.minsize:
                    entry.translate(n_header)
                    entry.info["SVTYPE"] = truvari.entry_variant_type(entry)
                    entry.info["SVLEN"] = sz
                out.write(entry)
    logging.info("Finished svinfo")
示例#8
0
    def get_pct(chrom, start, end):
        """
        Percent of nearby calls that are homozygous in the first sample.

        Looks at the calls fetched from start/end widened by args.buffer
        (clamped to the contig) and ignores those larger than args.maxgt.
        Returns None when fewer than args.mincount calls remain, nan when
        none remain, otherwise the percentage rounded to one decimal.
        """
        window_start = max(0, start - args.buffer)
        window_end = min(v.header.contigs[chrom].length, end + args.buffer)
        total = 0
        hom_count = 0
        for entry in v.fetch(chrom, window_start, window_end):
            if truvari.entry_size(entry) > args.maxgt:
                continue
            total += 1
            # a bool adds as 0 or 1
            hom_count += truvari.get_gt(entry.samples[0]["GT"]).name == "HOM"

        if total < args.mincount:
            return None
        if total == 0:
            return float('nan')

        percent = (hom_count / total) * 100
        return float(f"{percent:.1f}")
示例#9
0
def bpovl_main(cmdargs):
    """
    Main method: intersect vcf entries with an annotation tree.

    For every entry within the span/size limits, one row is recorded per
    annotation hit by the start breakpoint ('start_bnd'), by the end
    breakpoint ('end_bnd'), lying inside the entry ('overlaps') or
    enclosing the entry ('contains').  The rows are saved as a DataFrame to
    args.output with joblib.

    :param `cmdargs`: command line arguments handed to parse_args
    """
    args = parse_args(cmdargs)
    out_rows = []
    hit_cnt = 0
    # Context managed so the input handle is closed once the scan is over
    with pysam.VariantFile(args.input) as in_vcf:
        anno_tree, anno_cnt = truvari.build_anno_tree(args.anno,
                                                      *args.anno_psets)
        logging.info("Loaded %d annotations", anno_cnt)
        for entry in in_vcf:
            start, end = truvari.entry_boundaries(entry)
            if abs(end - start) > args.spanmax:
                continue
            if truvari.entry_size(entry) < args.sizemin:
                continue

            key = truvari.entry_to_key(entry)
            tree = anno_tree[entry.chrom]
            entry_rows = [[key, 'start_bnd', anno.data]
                          for anno in tree.at(start)]
            entry_rows.extend([key, 'end_bnd', anno.data]
                              for anno in tree.at(end))

            for anno in tree.overlap(start, end):
                if start <= anno.begin and anno.end <= end:
                    entry_rows.append([key, 'overlaps', anno.data])
                elif anno.begin <= start and end <= anno.end:
                    entry_rows.append([key, 'contains', anno.data])

            # An entry counts once no matter how many rows it produced
            if entry_rows:
                hit_cnt += 1
                out_rows.extend(entry_rows)
    logging.info("%d SVs hit annotations", hit_cnt)
    out = pd.DataFrame(out_rows, columns=["vcf_key",
                                          "intersection",
                                          "anno_key"])
    joblib.dump(out, args.output)
    logging.info("Finished bpovl")
示例#10
0
def hompct_main(cmd_args):
    """
    Main: annotate entries with the percent of nearby homozygous calls.

    Every entry of the input is written to the output.  Entries of at least
    args.minanno get a HOMPCT INFO value whenever enough neighbouring calls
    are available to compute one.

    :param `cmd_args`: command line arguments handed to parse_args
    """
    args = parse_args(cmd_args)

    # All handles are context managed so the output is flushed and closed
    # before the function returns
    with pysam.VariantFile(args.input) as v:

        def get_pct(chrom, start, end):
            """
            Percent of calls around start/end that are homozygous in the
            first sample; None below args.mincount calls, nan for none
            """
            tot = 0
            homs = 0
            for entry in v.fetch(
                    chrom, max(0, start - args.buffer),
                    min(v.header.contigs[chrom].length, end + args.buffer)):
                if truvari.entry_size(entry) > args.maxgt:
                    continue
                if truvari.get_gt(entry.samples[0]["GT"]).name == "HOM":
                    homs += 1
                tot += 1
            if tot < args.mincount:
                return None

            if tot == 0:
                return float('nan')

            return float(format((homs / tot) * 100, ".1f"))

        header = v.header.copy()
        header.add_line((
            '##INFO=<ID=HOMPCT,Number=1,Type=Float,'  # pylint: disable=consider-using-f-string
            'Description="Percent of calls < %dbp long within %dbp that are homozygous">'
        ) % (args.maxgt, args.buffer))

        # The entries are read through a second handle; presumably so the
        # fetches made by get_pct do not disturb this iteration
        with pysam.VariantFile(args.output, 'w', header=header) as out, \
                pysam.VariantFile(args.input) as v2:
            for entry in v2:
                if truvari.entry_size(entry) >= args.minanno:
                    entry.translate(header)
                    anno = get_pct(entry.chrom,
                                   *truvari.entry_boundaries(entry))
                    if anno is not None:
                        entry.info["HOMPCT"] = anno
                out.write(entry)
    logging.info("Finished hompct")
示例#11
0
 def run(self):
     """
     Annotate the input vcf entry by entry and write the result.

     Entries below min_length are copied unchanged.  Insertions are
     annotated from their first ALT sequence, deletions from their REF
     sequence; any other type is written as is.
     """
     logging.info("Annotating VCF")
     # should probably 'batch' it instead of running individually
     with pysam.VariantFile(self.in_vcf) as vcf, \
             pysam.VariantFile(self.out_vcf, 'w', header=self.n_header) as output:
         for entry in vcf:
             if truvari.entry_size(entry) >= self.min_length:
                 kind = truvari.entry_variant_type(entry)
                 if kind == "INS":
                     entry = self.annotate_entry(entry, entry.alts[0])
                 elif kind == "DEL":
                     entry = self.annotate_entry(entry, entry.ref)
             output.write(entry)
示例#12
0
def output_writer(call, outs, sizemin):
    """
    Annotate the entries of a MatchResult, write them to the appropriate
    file in outs and update the stats counters.

    Unmatched comparison calls smaller than sizemin are neither counted nor
    written, which is how FPs between sizefilt and sizemin are dropped.
    """
    stats = outs["stats_box"]
    if call.base:
        stats["base cnt"] += 1
        annotate_entry(call.base, call, outs['n_base_header'])
        if not call.state:
            stats["FN"] += 1
            outs["fn_out"].write(call.base)
        else:
            stats["gt_matrix"][str(call.base_gt)][str(call.comp_gt)] += 1

            stats["TP-base"] += 1
            outs["tpb_out"].write(call.base)
            gt_key = "TP-base_TP-gt" if call.gt_match else "TP-base_FP-gt"
            stats[gt_key] += 1

    if call.comp:
        annotate_entry(call.comp, call, outs['n_comp_header'])
        if call.state:
            stats["call cnt"] += 1
            stats["TP-call"] += 1
            outs["tpc_out"].write(call.comp)
            gt_key = "TP-call_TP-gt" if call.gt_match else "TP-call_FP-gt"
            stats[gt_key] += 1
        elif truvari.entry_size(call.comp) >= sizemin:
            # FPs between sizefilt and sizemin are deliberately not counted
            stats["call cnt"] += 1
            stats["FP"] += 1
            outs["fp_out"].write(call.comp)
示例#13
0
 def extract_seqs(self):
     """
     Create the fasta file of all the sequences

     One record is written per entry whose size lies within
     [min_length, max_length], named after the entry's position in the vcf.
     Insertions contribute their first ALT sequence, everything else its
     REF sequence.

     Returns the fasta file name.  The file is not deleted automatically.
     """
     tot_cnt = 0
     cnt = 0
     cntbp = 0
     # Leaving the with block flushes and closes the fasta, so it is complete
     # on disk by the time its name is returned
     with tempfile.NamedTemporaryFile(mode='w', delete=False) as ret, \
             pysam.VariantFile(self.in_vcf) as fh:
         for pos, entry in enumerate(fh):
             tot_cnt += 1
             entry_size = truvari.entry_size(entry)
             if not self.min_length <= entry_size <= self.max_length:
                 continue
             cnt += 1
             cntbp += entry_size
             if truvari.entry_variant_type(entry) == "INS":
                 seq = entry.alts[0]
             else:
                 seq = entry.ref
             ret.write(f">{pos}\n{seq}\n")
     logging.info(
         f"Extracted {cnt} sequences ({cntbp}bp) from {tot_cnt} entries")
     return ret.name
示例#14
0
def generate_stat_table(vcf_fn, args):
    """
    Given a vcf filename, count its entries into numpy arrays.

    :param `vcf_fn`: File name of the VCF to count
    :param `args`: object providing the qmin and qmax handed to get_scalebin

    :return: dict holding, per sample name, an array with dimensions
        [SVTYPE, SZBINS, QUALBINS, GT] and under "total" an array with
        dimensions [SVTYPE, SZBINS, QUALBINS]
    :rtype: dict
    """
    with pysam.VariantFile(vcf_fn) as vcf:
        samples = list(vcf.header.samples)
        ret = {
            name: numpy.zeros((len(SV), len(SZBINS), len(QUALBINS), len(GT)))
            for name in samples
        }
        ret["total"] = numpy.zeros((len(SV), len(SZBINS), len(QUALBINS)))
        for entry in vcf:
            # The three bin indexes are the same for every sample
            sv_idx = get_svtype(truvari.entry_variant_type(entry)).value
            sz_idx = SZBINS.index(get_sizebin(truvari.entry_size(entry)))
            if entry.qual is not None:
                _, qual_idx = get_scalebin(entry.qual, args.qmin, args.qmax)
            else:
                # Entries without a QUAL are counted in the first bin
                qual_idx = 0
            for name in samples:
                gt = get_gt(entry.samples[name]["GT"])
                ret[name][sv_idx, sz_idx, qual_idx, gt.value] += 1
            ret["total"][sv_idx, sz_idx, qual_idx] += 1

    return ret
示例#15
0
def vcf_to_df(fn, with_info=True, with_fmt=True, sample=None):
    """
    Parse a vcf file and turn it into a dataframe.
    Tries its best to pull info/format tags from the sample information.
    For Formats with Number=G, append _ref, _het, _hom. For things with Number=R, append _ref, _alt.
    Specify which sample with its name or index in the VCF.
    When a list of samples is given, each FORMAT column is prefixed with `<sample>_`.

    :param `fn`: File name of VCF to open and turn into a DataFrame
    :type `fn`: string
    :param `with_info`:  Add the INFO fields from the VCF to the DataFrame columns
    :type `with_info`: boolean, optional
    :param `with_fmt`: Add the FORMAT fields from the VCF to the DataFrame columns
    :type `with_fmt`: boolean, optional
    :param `sample`: Sample(s) from the VCF to parse, defaults to the first sample. Only used when with_fmt==True
    :type `sample`: int/string/list, optional

    :return: Converted VCF, indexed by the entry key
    :rtype: pandas.DataFrame

    Example
        >>> import truvari
        >>> df = truvari.vcf_to_df("repo_utils/test_files/input2.vcf.gz", True, True)
        >>> df.columns
        Index(['id', 'svtype', 'svlen', 'szbin', 'qual', 'filter', 'is_pass', 'QNAME',
               'QSTART', 'QSTRAND', 'SVTYPE', 'SVLEN', 'GT', 'PL_ref', 'PL_het',
               'PL_hom', 'AD_ref', 'AD_alt'],
              dtype='object')
    """
    header = [
        "key", "id", "svtype", "svlen", "szbin", "qual", "filter", "is_pass"
    ]
    rows = []
    # Context managed so the vcf handle is closed once every row is collected
    with pysam.VariantFile(fn) as v:
        info_ops = []
        if with_info:
            info_header, info_ops = tags_to_ops(v.header.info.items())
            logging.debug(info_header)
            header.extend(info_header)

        fmt_ops = []
        if with_fmt:
            # get all the format fields and how to parse them from the
            # header, then add them to the column names
            fmt_header, fmt_ops = tags_to_ops(v.header.formats.items())
            logging.debug(fmt_header)
            if isinstance(sample, list):
                header.extend(
                    [f'{s}_{f}' for s, f in itertools.product(sample, fmt_header)])
            else:
                header.extend(fmt_header)

            if sample is None:
                sample = v.header.samples[0]

            if not isinstance(sample, list):
                sample = [sample]
        else:
            sample = []

        for entry in v:
            varsize = truvari.entry_size(entry)
            filt = list(entry.filter)
            cur_row = [
                truvari.entry_to_key(entry), entry.id,
                truvari.entry_variant_type(entry), varsize,
                truvari.get_sizebin(varsize), entry.qual, filt,
                # no filter at all is treated the same as PASS
                not filt or filt[0] == 'PASS'
            ]

            for i, op in info_ops:
                cur_row.extend(op(entry.info, i))

            for samp in sample:
                for i, op in fmt_ops:
                    cur_row.extend(op(entry.samples[samp], i))

            rows.append(cur_row)

    ret = pd.DataFrame(rows, columns=header)
    ret["szbin"] = ret["szbin"].astype(SZBINTYPE)
    ret["svtype"] = ret["svtype"].astype(SVTYTYPE)
    return ret.set_index("key")