Example #1
def windowpairs_from_vcf(chrom, vcf_file_list, sv_type_list):
    '''
    Generate chromosome-wide window pairs from VCF files.
    The SV types included in the window pairs can be restricted.

    :param chrom: Chromosome for which window pairs are generated
    :param vcf_file_list: List of paths to VCF files (one per caller)
    :param sv_type_list: List of SV types (DEL,INV,BND,INS,DUP) for which windows are generated
    :return: Set of StructuralVariant window pairs
    '''

    window_pairs = set()

    for vcf_file in vcf_file_list:
        assert os.path.isfile(vcf_file)
        vcf_in = VariantFile(vcf_file, 'r')
        caller = re.findall(r'^\w*', vcf_file)
        lostSV_logfile = open("Excluded_SVs_" + caller[0] + ".log", 'w')
        lostSV_logfile.write(str(vcf_in.header) + "\n")
        for rec in vcf_in.fetch():
            svrec = SVRecord_generic(rec, caller[0])
            startCI = abs(svrec.cipos[0]) + svrec.cipos[1]
            endCI = abs(svrec.ciend[0]) + svrec.ciend[1]
            if startCI > 200 or endCI > 200 or svrec.start == svrec.end:
                lostSV_logfile.write(str(rec) + "\n")
            elif svrec.chrom == chrom and svrec.svtype in sv_type_list:
                window_pairs.add(
                    StructuralVariant(Breakpoint(svrec.chrom, svrec.start),
                                      Breakpoint(svrec.chrom, svrec.end)))

        vcf_in.close()
        # One log file is opened per caller, so close it inside the loop
        lostSV_logfile.close()

    return window_pairs
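
The snippet relies on SVRecord_generic, Breakpoint, and StructuralVariant from the surrounding project. A hypothetical sketch of the two container types, consistent with the set-based deduplication above (frozen dataclasses are hashable):

from dataclasses import dataclass

@dataclass(frozen=True)  # frozen instances are hashable, so the set can deduplicate
class Breakpoint:
    chrom: str
    pos: int

@dataclass(frozen=True)
class StructuralVariant:
    bp1: Breakpoint
    bp2: Breakpoint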
Example #2
def file_process(fname):
    try:
        cpath = fname.rstrip('\n')
        sys.stderr.write("Processing " + cpath + "\n")
        sys.stderr.flush()
        in_vcf = VariantFile(cpath)
        # pdb.set_trace()
        # tbl_dict, cat_dict and the header template `good_boy` are
        # module-level globals (defined in Example #31)
        for cat in tbl_dict:
            for key in tbl_dict[cat]:
                getattr(in_vcf.header, cat)[key].remove_header()
                in_vcf.header.add_meta(cat_dict[cat],
                                       items=[('ID', key),
                                              ('Number',
                                               getattr(good_boy.header,
                                                       cat)[key].number),
                                              ('Type',
                                               getattr(good_boy.header,
                                                       cat)[key].type),
                                              ('Description',
                                               getattr(good_boy.header,
                                                       cat)[key].description)])
        # pdb.set_trace()
        out_vcf = VariantFile("-", 'w', header=in_vcf.header)
        for rec in in_vcf.fetch():
            out_vcf.write(rec)
        out_vcf.close()
    except Exception as e:
        sys.stderr.write(str(e) + "\n failed to process " + cpath + "\n")
Example #3
def vcf_file_to_regions(in_file: Union[str, os.PathLike]):
    vcf = VariantFile(in_file, mode="r")
    try:  # VariantFile automatically opens file
        for variant in vcf:  # type: VariantRecord
            yield BedRegion(variant.contig, variant.start, variant.stop)
    finally:
        # Make sure vcf is always closed
        vcf.close()
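
A usage sketch for the generator; it assumes BedRegion exposes contig, start, and stop fields (e.g. a NamedTuple), matching the positional construction above:

for region in vcf_file_to_regions("calls.vcf.gz"):
    print(f"{region.contig}\t{region.start}\t{region.stop}")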
Example #4
class VcfAugmenter(ABC):
    def __init__(self, in_path, command_line, out_file=sys.stdout):
        """
        in_path -- Path to input VCF, used as template.
        command_line -- A string that will be added as a VCF header entry
            (use None to not add this to the VCF header)
        out_file -- Open file-like object to which VCF is written.
        """
        # TODO This is slow because it reads in the entire VCF one extra time
        contigs, formats, infos = missing_headers(in_path)
        # We repair the header (adding missing contigs, formats, infos) of the *input* VCF because
        # we will modify the records that we read, and these are associated with the input file.
        self._reader = VariantFile(in_path)
        augment_header(self._reader.header, contigs, formats, infos)
        if command_line is not None:
            command_line = '"' + command_line.replace('"', "") + '"'
            self._reader.header.add_meta("commandline", command_line)
        self.setup_header(self._reader.header)
        self._writer = VariantFile(out_file, mode="w", header=self._reader.header)
        self._unprocessed_record = None
        self._reader_iter = iter(self._reader)

    @abstractmethod
    def setup_header(self, header):
        pass

    def close(self):
        self._writer.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @property
    def samples(self):
        return list(self._reader.header.samples)

    def _iterrecords(self, chromosome):
        """Yield all records for the target chromosome"""
        n = 0
        if self._unprocessed_record is not None:
            assert self._unprocessed_record.chrom == chromosome
            yield self._unprocessed_record
            n += 1
        for record in self._reader_iter:
            n += 1
            if record.chrom != chromosome:
                # save it for later
                self._unprocessed_record = record
                assert n != 1
                return
            yield record
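
Since setup_header is abstract, VcfAugmenter is meant to be subclassed. A minimal hypothetical subclass that declares the FORMAT field its records will carry:

class PhaseTagAugmenter(VcfAugmenter):  # hypothetical example subclass
    def setup_header(self, header):
        # declare the PS FORMAT field on the (already repaired) input header
        header.add_line(
            '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">')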
Example #5
def to_arrow(vfname, batchparams, cols, nested_props=("FILTER", "FORMAT")):
    """Convert `VariantRecord` batches to Arrow `RecordBatch`es

    The returned Arrow buffer breaks compatibility with a standard VCF column
    header: ALT -> ALTS.  This is because `pysam.VariantRecord` does this, and
    it makes sense.  This also significantly reduces code complexity.

    The keys nested under the INFO column are completely free-form, so they are
    detected automatically from the VCF file header.  During conversion to an
    Arrow buffer, filling these fields presents a significant book-keeping
    challenge (they can also be nested!).  So we opt to fill these
    semi-automatically, and any absent fields are set to NULL (thanks to
    Arrow!).

    Note that when converting to other formats, these NULLs may need to be
    filled with reasonable alternatives, which can come at a cost.  For
    example, Pandas does not support NULLs, and the likely replacement is
    numpy.nan.  This means you first lose zero-copy conversion, and the field
    type may get promoted to float!  Beware.

    vfname      -- Variant file name to be opened with `VariantFile`
    batchparams -- Parameters to get VariantRecord batch iterator
    cols        -- Record column spec (as returned by get_vcf_cols(..))

    returns an Arrow `RecordBatch`

    """
    batch = []
    vf = VariantFile(vfname, mode="r", threads=4)  # FIXME:
    for vrec in vf.fetch(*batchparams):
        # break compatibility with VCF file column header: ALT -> ALTS.
        # INFO_* fields are filtered out as they are handled separately later.
        row = OrderedDict((c, getattr(vrec, c.lower())) for c in cols
                          if c in _simple_vcf_cols)
        # vrec.{prop}: [('<filter>', <pysam.libcbcf.VariantMetadata>)]
        row.update((prop, [key for key in getattr(vrec, prop.lower()).keys()])
                   for prop in nested_props)
        # missing INFO_* fields are treated as NULLs (see doc string)
        row.update((f"INFO_{k}", v) for k, v in vrec.info.items())
        # reverse the layout: fmt in sample -> sample in fmt.  this way
        # for a given FORMAT field, all samples will be in adjacent blocks.
        row.update(
            (f"{fmt}_{sample.name}", (int(sample.phased), *sample.values()[i]))
            for i, fmt in enumerate(row["FORMAT"])
            for sample in vrec.samples.values())
        # NOTE: indexing above slows the generator expr by a factor of two.
        # indexing relies on the fixed ordering of FORMAT field values.
        batch.append(row)
    vf.close()  # FIXME:
    # from pprint import pprint
    # pprint(batch[-1])
    # populate as struct -> flatten
    batch = pa.array(batch, type=pa.struct(cols)).flatten()
    return pa.RecordBatch.from_arrays(batch, pa.schema(cols))
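
A hedged invocation sketch, assuming cols came from get_vcf_cols(..) as a list of (name, pyarrow.DataType) pairs and that batchparams selects a region for fetch():

import pyarrow as pa

batch = to_arrow("calls.vcf.gz", ("chr1", 0, 1_000_000), cols)
table = pa.Table.from_batches([batch])  # assemble batches into a Table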
Example #6
def decompose_multiallelic_record(in_vcf, out_vcf):
    """Break records with multiple ALT alleles into multiple records."""
    i_vcf = VariantFile(in_vcf, "r")
    # str.strip(".gz") would strip characters, not the suffix, so slice it off
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=i_vcf.header)

    for record in i_vcf:
        # Only Mutect puts multiple ALTs in one record
        number_events = len(record.alts)
        # Temporary fix due to segfault
        # see https://github.com/leukgen/click_mergevcfs/issues/2
        if number_events >= 8:
            continue
        elif number_events > 1:
            click.echo("file={},pos={}".format(in_vcf, record.pos))
            for i in range(0, number_events):
                new_rec = record.copy()
                new_rec.alts = tuple([record.alts[i]])
                # Multiallelic site GTs are e.g. 0/1/2, which causes errors
                # later, so change them to ./.
                genotypes = list(record.samples)
                for g in genotypes:
                    # Overwrite GT
                    new_rec.samples[g]["GT"] = (None, None)
                    # Use none_if_tuple_out_of_idx because
                    # record.samples[g]['AD'] would sometimes return
                    # a tuple of (None,)
                    if "AD" in list(record.samples[g]):
                        new_rec.samples[g]["AD"] = (
                            record.samples[g]["AD"][0],
                            none_if_tuple_out_of_idx(t=record.samples[g]["AD"],
                                                     index=i + 1),
                        )
                    if "AF" in list(record.samples[g]):
                        new_rec.samples[g]["AF"] = none_if_tuple_out_of_idx(
                            t=record.samples[g]["AF"], index=i)
                    if "F1R2" in list(record.samples[g]):
                        new_rec.samples[g]["F1R2"] = (
                            record.samples[g]["F1R2"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F1R2"], index=i + 1),
                        )
                    if "F2R1" in list(record.samples[g]):
                        new_rec.samples[g]["F2R1"] = (
                            record.samples[g]["F2R1"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F2R1"], index=i + 1),
                        )
                o_vcf.write(new_rec)
        else:
            o_vcf.write(record)

    o_vcf.close()
    subprocess.check_call(["bgzip", "-f", raw_out])
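
none_if_tuple_out_of_idx is defined elsewhere in click_mergevcfs; judging from the call sites above, a minimal sketch would be:

def none_if_tuple_out_of_idx(t, index):
    # return t[index], or None when the tuple is too short (e.g. AD == (None,))
    return t[index] if index < len(t) else None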
Example #7
def filter_somatic(in_vcf_path, out_vcf_path):
    in_vcf = VariantFile(in_vcf_path)
    out_vcf = VariantFile(out_vcf_path, 'w', header=in_vcf.header)
    num_skipped_records = 0
    for rec in in_vcf:
        if is_somatic(rec):
            try:
                out_vcf.write(rec)
            except OSError:
                num_skipped_records += 1
    print("Skipped " + str(num_skipped_records) + " bad records")
    in_vcf.close()
    out_vcf.close()
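
is_somatic is project-specific and not shown. A hedged sketch that keys off a SOMATIC INFO flag (an assumption, not necessarily the project's actual rule):

def is_somatic(rec):
    # assumes the upstream caller marks somatic variants with a SOMATIC flag
    return rec.info.get("SOMATIC", False)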
Example #8
def filter_bcf_file(self, bcf_file):
    bcf_in = VariantFile(bcf_file, 'rb')
    bcf_out = VariantFile("%s.target.vcf" % bcf_file[:-4], 'w',
                          header=bcf_in.header)
    for rec in bcf_in.fetch():
        if rec.contig != self.contig_id:
            continue
        # Without a target interval there is nothing to write
        if self.contig_start == False and self.contig_end == False:
            continue
        if self.contig_start <= rec.pos <= self.contig_end:
            bcf_out.write(rec)
    bcf_in.close()
    bcf_out.close()
Example #9
def main():
    vcf = VariantFile(snakemake.input.vcf)
    outlier_table = pd.read_table(snakemake.input.outliers)
    filtered = VariantFile(snakemake.output[0], mode='w', header=vcf.header)

    outliers = defaultdict(list)
    for idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    for record in remove_outliers(vcf, outliers):
        filtered.write(record)

    filtered.close()
Example #10
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--vcf', help='Input vcf', required=True)
    parser.add_argument(
        '--dict', help='Tab-delimited sample id conversion table', required=True)

    args = parser.parse_args()
    vcf = VariantFile(args.vcf)
    id_dict = get_id_dictionary(args.dict)
    new_ids = get_new_ids(vcf, id_dict)
    print_ids(new_ids)
    vcf.close()
Example #11
def main():
    vcf_path = sys.argv[1]
    vcf = VariantFile(vcf_path, 'r')

    # First pass: collect every contig name seen in the records
    contigs = set()
    for record in vcf:
        contigs.add(record.chrom)

    # Reopen the file, since the first pass consumed the iterator,
    # and add the collected contigs to the header
    vcf.close()
    vcf = VariantFile(vcf_path, 'r')
    for contig in sorted(contigs):
        vcf.header.add_line("##contig=<ID={}>".format(contig))

    # Emit the augmented header followed by all records
    print(vcf.header, end="")
    for record in vcf:
        print(record, end="")
Example #12
def prepare_octopus_vcf_for_rtg(octopus_vcf, tumour_sample, out_vcf_name):
    """"
    Octopus reports non-diploid genotypes for somatic variants.
    """
    in_vcf = VariantFile(octopus_vcf)
    out_vcf = VariantFile(out_vcf_name, 'w', header=in_vcf.header)
    n_failed = 0
    for record in in_vcf:
        old_gt = record.samples[tumour_sample]['GT']
        assert (len(old_gt) > 1)
        somatic_allele = next(a for a in reversed(list(old_gt))
                              if a is not None and a > 0)
        record.samples[tumour_sample]['GT'] = (old_gt[0], somatic_allele)
        try:
            out_vcf.write(record)
        except OSError:
            n_failed += 1
    out_vcf.close()
    index(out_vcf_name)
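
index is not shown; a sketch using pysam's tabix indexing, assuming the output VCF is (or may be) bgzipped:

from pysam import tabix_index

def index(vcf_path):
    # compresses with bgzip if needed, then writes a .tbi index beside the file
    tabix_index(vcf_path, preset="vcf", force=True)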
Example #13
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str,
                      output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dToxoG records; bgzipped and tabix-indexed if the name ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    # setup
    total = 0
    written = 0
    tag = "oxog"

    # header
    header = generate_header(reference_fa, tag)

    # Writer
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=header)

    # Process
    try:
        with open(input_maf, "rt") as fh:
            for record in maf_generator(fh):
                total += 1
                if record["oxoGCut"] == "1":
                    new_vcf_record = build_new_record(record, writer, tag)
                    writer.write(new_vcf_record)
                    written += 1

    finally:
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
Example #14
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variants, to reduce
    ambiguity about confident variants in the merged VCF.
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)

    i_vcf = VariantFile(in_vcf, "rb")
    new_header = i_vcf.header.copy()
    try:
        new_header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
        i_vcf.header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
    except ValueError:
        pass

    # str.strip(".gz") would strip characters, not the suffix, so slice it off
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=new_header)

    for record in i_vcf:
        new_rec = record.copy()
        filters = list(record.filter)
        if filters and filters[0] == "PASS":
            new_rec.info["PASSED_{}".format(caller)] = 1
        o_vcf.write(new_rec)

    o_vcf.close()

    subprocess.check_call(["bgzip", "-f", raw_out])
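
get_caller is defined elsewhere in click_mergevcfs. A hypothetical sketch that infers the caller name from the filename:

import os

def get_caller(vcf_path):
    # e.g. "caveman.vcf.gz" -> "caveman"; purely illustrative
    return os.path.basename(vcf_path).split(".")[0]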
Example #15
def main():
    parser = argparse.ArgumentParser("find_outliers.py")
    parser.add_argument("input", type=str, help="list of samples names")
    parser.add_argument("output", type=str, help="list of samples names")
    parser.add_argument("outliers", type=str, help="list of samples names")

    args = parser.parse_args()

    #vcf = VariantFile(snakemake.input.vcf)
    vcf = VariantFile(args.input)

    outlier_table = pd.read_table(args.outliers)
    filtered = VariantFile(args.output, mode='w', header=vcf.header)

    outliers = defaultdict(list)
    for idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    for record in remove_outliers(vcf, outliers):
        filtered.write(record)

    filtered.close()
Example #16
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template = pkg_resources.resource_filename('svtools',
                                               'data/standard_template.vcf')
    template = VariantFile(template)
    vcf = VariantFile(args.vcf)

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add samples from VCF being standardized
    header = template.header
    for sample in vcf.header.samples:
        header.add_sample(sample)

    # Tag source in header
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout)
    idx = 1
    for record in standardizer.standardize_vcf():
        if any_called(record) or args.include_reference_sites:
            if args.prefix is not None:
                record.id = '{0}_{1}'.format(args.prefix, idx)
                idx += 1

            fout.write(record)

    #  for std_rec in standardize_vcf(vcf, fout):
    #  fout.write(std_rec)

    fout.close()
    vcf.close()
Example #17
File: vcf.py Project: rvicedomini/whatshap
class VcfReader:
    """
    Read a VCF file chromosome by chromosome.
    """
    def __init__(
        self,
        path,
        indels=False,
        phases=False,
        genotype_likelihoods=False,
        ignore_genotypes=False,
        ploidy=None,
    ):
        """
        path -- Path to VCF file
        indels -- Whether to also include insertions and deletions in the
            list of variants.
        ignore_genotypes -- Ignore all genotypes (e.g. when running a
            genotyping algorithm, the input VCF may not contain any).
        ploidy -- Ploidy of the samples
        """
        # TODO Always include deletions since they can 'overlap' other variants
        self._indels = indels
        self._vcf_reader = VariantFile(path)
        self._path = path
        self._phases = phases
        self._genotype_likelihoods = genotype_likelihoods
        self._ignore_genotypes = ignore_genotypes
        self.samples = list(
            self._vcf_reader.header.samples)  # intentionally public
        self.ploidy = ploidy
        logger.debug("Found %d sample(s) in the VCF file.", len(self.samples))

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # follows same structure as for ReadSetReader
        self.close()

    def close(self):
        self._vcf_reader.close()

    @property
    def path(self):
        return self._vcf_reader.filename.decode()

    def _fetch(self, chromosome: str, start=0, end=None):
        try:
            records = self._vcf_reader.fetch(chromosome, start=start, stop=end)
        except ValueError as e:
            if "invalid contig" in e.args[0]:
                raise VcfInvalidChromosome(e.args[0]) from None
            elif "fetch requires an index" in e.args[0]:
                raise VcfIndexMissing(
                    "{} is missing an index (.tbi or .csi)".format(
                        self._path)) from None
            else:
                raise
        return records

    def fetch(self, chromosome: str, start=0, end=None):
        """
        Fetch records from a single chromosome, optionally restricted to a single region.

        Return a VariantTable object.
        """
        records = list(self._fetch(chromosome, start=start, end=end))
        return self._process_single_chromosome(chromosome, records)

    def fetch_regions(self, chromosome: str, regions):
        """
        Fetch records from a single chromosome that overlap the given regions.

        :param regions: a list of start, end tuples (end can be None)
        """
        records = []
        for start, end in regions:
            records.extend(list(self._fetch(chromosome, start=start, end=end)))
        return self._process_single_chromosome(chromosome, records)

    def __iter__(self):
        """
        Yield VariantTable objects for each chromosome.

        Multi-ALT sites are skipped.
        """
        for chromosome, records in itertools.groupby(
                self._vcf_reader, lambda record: record.chrom):
            yield self._process_single_chromosome(chromosome, records)

    @staticmethod
    def _extract_HP_phase(call):
        hp = call.get("HP")
        if hp is None or hp == (".", ):
            return None
        fields = [[int(x) for x in s.split("-")] for s in hp]
        for i in range(len(fields)):
            assert fields[0][0] == fields[i][0]
        block_id = fields[0][0]
        phase = tuple(field[1] - 1 for field in fields)
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    @staticmethod
    def _extract_GT_PS_phase(call):
        is_het = not all(x == call["GT"][0] for x in call["GT"])
        if not is_het:
            return None
        if not call.phased:
            return None
        block_id = call.get("PS", 0)
        phase = call["GT"]
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    def _process_single_chromosome(self, chromosome, records):
        phase_detected = None
        n_snvs = 0
        n_other = 0
        n_multi = 0
        table = VariantTable(chromosome, self.samples)
        prev_position = None
        for record in records:
            if len(record.alts) > 1:
                # Multi-ALT sites are not supported, yet
                n_multi += 1
                continue

            pos, ref, alt = record.start, str(record.ref), str(record.alts[0])
            if len(ref) == len(alt) == 1:
                n_snvs += 1
            else:
                n_other += 1
                if not self._indels:
                    continue

            if (prev_position is not None) and (prev_position > pos):
                raise VcfNotSortedError(
                    "VCF not ordered: {}:{} appears before {}:{}".format(
                        chromosome, prev_position + 1, chromosome, pos + 1))

            if prev_position == pos:
                logger.warning(
                    "Skipping duplicated position %s on chromosome %r",
                    pos + 1,
                    chromosome,
                )
                continue
            prev_position = pos

            # Read phasing information (allow GT/PS or HP phase information, but not both),
            # if requested
            if self._phases:
                phases = []
                for sample_name, call in record.samples.items():
                    phase = None
                    for extract_phase, phase_name in [
                        (self._extract_HP_phase, "HP"),
                        (self._extract_GT_PS_phase, "GT_PS"),
                    ]:
                        p = extract_phase(call)
                        if p is not None:
                            if phase_detected is None:
                                phase_detected = phase_name
                            elif phase_detected != phase_name:
                                raise MixedPhasingError(
                                    "Mixed phasing information in input VCF (e.g. mixing PS "
                                    "and HP fields)")
                            phase = p
                            # check for ploidy consistency and limits
                            phase_ploidy = len(p.phase)
                            if phase_ploidy > get_max_genotype_ploidy():
                                raise PloidyError(
                                    "Ploidies higher than {} are not supported."
                                    "".format(get_max_genotype_ploidy()))
                            elif p is None or None in p:
                                pass
                            elif self.ploidy is None:
                                self.ploidy = phase_ploidy
                            elif phase_ploidy != self.ploidy:
                                print("phase= {}".format(phase))
                                raise PloidyError(
                                    "Phasing information contains inconsistent ploidy ({} and "
                                    "{})".format(self.ploidy, phase_ploidy))
                    phases.append(phase)
            else:
                phases = [None] * len(record.samples)

            # Read genotype likelihoods, if requested
            if self._genotype_likelihoods:
                genotype_likelihoods = []
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    PL = call.get("PL", None)
                    # Prefer GLs (floats) over PLs (ints) if both are present
                    if GL is not None:
                        genotype_likelihoods.append(GenotypeLikelihoods(GL))
                    elif PL is not None:
                        genotype_likelihoods.append(
                            GenotypeLikelihoods([pl / -10 for pl in PL]))
                    else:
                        genotype_likelihoods.append(None)
            else:
                genotype_likelihoods = [None] * len(record.samples)

            if not self._ignore_genotypes:
                # check for ploidy consistency and limits
                genotype_lists = [
                    call["GT"] for call in record.samples.values()
                ]
                for geno in genotype_lists:
                    geno_ploidy = len(geno)
                    if geno_ploidy > get_max_genotype_ploidy():
                        raise PloidyError(
                            "Ploidies higher than {} are not supported."
                            "".format(get_max_genotype_ploidy()))
                    elif geno is None or None in geno:
                        pass
                    elif self.ploidy is None:
                        self.ploidy = geno_ploidy
                    elif geno_ploidy != self.ploidy:
                        raise PloidyError("Inconsistent ploidy ({} and "
                                          "{})".format(self.ploidy,
                                                       geno_ploidy))

                genotypes = [
                    genotype_code(geno_list) for geno_list in genotype_lists
                ]
            else:
                genotypes = [Genotype([]) for i in range(len(self.samples))]
                phases = [None] * len(self.samples)
            variant = VcfVariant(position=pos,
                                 reference_allele=ref,
                                 alternative_allele=alt)
            table.add_variant(variant, genotypes, phases, genotype_likelihoods)

        logger.debug(
            "Parsed %s SNVs and %s non-SNVs. Also skipped %s multi-ALTs.",
            n_snvs,
            n_other,
            n_multi,
        )

        # TODO remove overlapping variants
        return table
Example #18
#coding:utf-8
from sys import argv
from os.path import exists
import os
import pysam
import numpy as np
from pysam import VariantFile

script, bam_file, vcf_file, output_bam_file = argv

bamfile = pysam.AlignmentFile(bam_file, "rb")
vcffile = VariantFile(vcf_file)
output_bamfile = pysam.AlignmentFile(output_bam_file, "wb", template=bamfile)
for rec in vcffile.fetch():
    for read in bamfile.fetch():
        # rec.start is 0-based like read.reference_start (rec.pos is 1-based)
        if rec.start == read.reference_start:
            output_bamfile.write(read)

output_bamfile.close()
bamfile.close()
vcffile.close()
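
The nested loop above rescans the whole BAM once per variant. With a coordinate-sorted, indexed BAM, fetching only the reads overlapping each variant is far faster; a sketch with the same matching rule, to run in place of the loop above while the files are still open:

for rec in vcffile.fetch():
    # region fetch() returns only reads overlapping the variant
    for read in bamfile.fetch(rec.contig, rec.start, rec.stop):
        if read.reference_start == rec.start:
            output_bamfile.write(read)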
Example #19
    dest='call_vcf',
    help='Called vcf to search for variants not found in reference vcf')
parser.add_argument(
    '-o',
    '--out-vcf',
    action='store',
    dest='out_vcf',
    help='Output vcf that is a subset of called vcf meeting criteria')

args = parser.parse_args()

ref_vcf = VariantFile(args.ref_vcf)
called_vcf = VariantFile(args.call_vcf, threads=4)
out_vcf = VariantFile(args.out_vcf, "w", header=called_vcf.header, threads=4)
x = 0
m = 1000
for record in called_vcf.fetch():
    if x % m == 0:
        sys.stderr.write('Processed ' + str(x) + " records\n")
        sys.stderr.flush()
    f = 0
    for comp in ref_vcf.fetch(record.contig, record.start, record.stop):
        if record.pos == comp.pos and record.alleles == comp.alleles:
            f = 1
            break
    if not f:
        out_vcf.write(record)
    x += 1
out_vcf.close()
ref_vcf.close()
called_vcf.close()
Example #20
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist',
                        type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-r',
                        '--region',
                        default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d',
                        '--dist',
                        type=int,
                        default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f',
                        '--frac',
                        type=float,
                        default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x',
                        '--blacklist',
                        metavar='BED.GZ',
                        type=TabixFile,
                        default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    parser.add_argument('-z',
                        '--svsize',
                        type=int,
                        default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [500]')
    parser.add_argument('-p',
                        '--prefix',
                        default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t',
                        '--svtypes',
                        default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--preserve-ids',
                        action='store_true',
                        default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    #  parser.add_argument('--cluster-bed', type=argparse.FileType('w'),
    #                      help='Bed of constituent calls in each cluster')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')

    svc = VCFCluster(vcfs,
                     dist=args.dist,
                     blacklist=args.blacklist,
                     frac=args.frac,
                     svtypes=svtypes,
                     region=args.region,
                     preserve_ids=args.preserve_ids)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    fout = VariantFile(fout, mode='w', header=svc.header)

    for i, record in enumerate(svc.cluster()):
        # Name record
        if args.prefix:
            name = [args.prefix]
        else:
            name = ['SV']
        name.append(args.source)
        if args.region:
            chrom = args.region.split(':')[0]
            name.append(chrom)
        name.append(str(i + 1))
        record.id = '_'.join(name)

        # Size filter (CTX have size -1); applied before writing so that
        # undersized records are actually dropped
        if -1 < record.info['SVLEN'] < args.svsize:
            continue

        fout.write(record)

        #  if args.cluster_bed is not None:
        #  flatten_pos(cluster, record.ID, args.cluster_bed)

    fout.close()
Example #21
            if genotype[0] == genotype[2]:
                if genotype[0] == '0':
                    REF_HOMO += 1
                else:
                    ALT_HOMO += 1
            else:
                HET += 1

        sys.stdout.write('%s\t%d\t%d\t%d\t%d\n' %
                         ('\t'.join(out), REF_HOMO, HET, ALT_HOMO, MISS))

    # Start file reading from here.
    infile = VariantFile('-', 'r')
    #sys.stdout.write(str(infile.header))
    for line in infile:
        ss = str(line).strip().split()
        setoutGenoArrayIndex(ss[8])

        if OUT_FORMAT == 'ALT_FRE':
            outputAlleleFrequency(ss)
        elif OUT_FORMAT == 'GP_GENO':
            outputGPGenotype(ss)
        elif OUT_FORMAT == 'GT_GENO':
            outputGTGenotype(ss)

    infile.close()
sys.stdout.flush()
sys.stdout.close()
sys.stderr.flush()
sys.stderr.close()
Example #22
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs',
                        type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size',
                        type=int,
                        default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites',
                        action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')
    parser.add_argument('--sample-names',
                        type=str,
                        default=None,
                        help='Comma-delimited list of sample names to use in '
                        'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        for line in args.contigs:
            contig, length = line.split()[:2]
            header.add_line(contig_line.format(**locals()))
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename('svtk',
                                                   'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Parse new sample names if provided
    if args.sample_names:
        sample_names_list = args.sample_names.split(',')
    else:
        sample_names_list = vcf.header.samples

    # Tag source in header
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout,
                                          sample_names_list, args.prefix,
                                          args.min_size,
                                          args.include_reference_sites,
                                          args.call_null_sites)

    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()
Example #23
    # Add annotation to cols and write to file
    ###
    cols.append(str(rsid))
    cols.append(str(allele))
    cols.append(str(gene))
    cols.append(str(annotation))
    cols.append(str(hgvs_c))
    cols.append(str(hgvs_p))
    for info in infos_out:
        if type(info) is tuple or type(info) is list:
            info_str = [str(s) for s in info]
            cols.append(",".join(info_str))
        else:
            cols.append(str(info))

    output_handle.write("\t".join(cols))
    output_handle.write("\n")

######
# Clean up
######
if input_handle is not None:
    input_handle.close()

if output_handle is not None:
    output_handle.close()

if vcf_handle is not None:
    vcf_handle.close()

print "Complete!"
Example #24
    def setoutGenoArrayIndex(oldFormatTags):
        outGenoArrayIndex.clear()
        ss = oldFormatTags.upper().split(':')
        for x in tags:
            try:
                y = ss.index(x)
                outGenoArrayIndex.append(y)
            except ValueError:
                sys.stderr.write('ERROR: cannot find tag "%s" in the input vcf FORMAT field.\n' % (x))
                sys.exit(-1)

    infile = VariantFile('-', 'r')
    sys.stdout.write(str(infile.header))
    for line in infile:
        ss = str(line).strip().split()
        out = ss[:vcfMetaCols]
        out[8] = otags                  # update genotype FORMAT tags
        setoutGenoArrayIndex(ss[8])     #Check format line by line.
        for x in ss[vcfMetaCols:]:
            #if not outGenoArrayIndex:
            #    setoutGenoArrayIndex(ss[8])
            out.append(reformat(x))

        sys.stdout.write('%s\n'%('\t'.join(out)))

    infile.close()
sys.stdout.flush()
sys.stdout.close()
sys.stderr.flush()
sys.stderr.close()
Example #25
    all_coords = set(first_coords + second_coords)
    all_coords = sorted(list(all_coords))
    #print(all_coords)

    # main loop
    for site in all_coords:
        match_xlist = [rec for rec in vcf_primary.fetch() if rec.pos==site]
        match_ylist = [rec for rec in vcf_secondary.fetch() if rec.pos==site]

        if len(match_xlist) == 0: # no match
            recx_coverage = -1 # any positive number is larger than this
        else:
            assert len(match_xlist) == 1
            recx = match_xlist[0]
            recx_coverage = recx.info["DP"]

        if len(match_ylist) == 0: # no match
            recy_coverage = -1 # any positive number is larger than this
        else:
            assert len(match_ylist) == 1
            recy = match_ylist[0]
            recy_coverage = recy.info["DP"]

        if recy_coverage > recx_coverage:
            vcf_out.write(recy)
        else:
            vcf_out.write(recx)
    vcf_primary.close()
    vcf_secondary.close()
    vcf_out.close()
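
The fragment assumes vcf_primary, vcf_secondary, vcf_out, and the coordinate lists were prepared earlier. A minimal setup consistent with the code, with hypothetical filenames and assuming indexed inputs (the per-site fetch() calls require an index):

from pysam import VariantFile

vcf_primary = VariantFile("primary.vcf.gz")
vcf_secondary = VariantFile("secondary.vcf.gz")
vcf_out = VariantFile("merged.vcf", "w", header=vcf_primary.header)
first_coords = [rec.pos for rec in vcf_primary.fetch()]
second_coords = [rec.pos for rec in vcf_secondary.fetch()]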
Example #26
def read_vcf(fh, alleles, slh=None):
    vcf_in = VariantFile(fh)
    sample = list(vcf_in.header.samples)[0]
    availcols = next(vcf_in.fetch()).format.keys()
    vcf_in.seek(0)

    # Check if sample size info is in header
    global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0]
    if alleles:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
        usecols = list(dtype_dict.keys())

        # Read in data
        if 'SS' in availcols:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.samples[sample]['SS'][0], rec.alts[0], rec.ref
            ] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.alts[0], rec.ref
            ] for rec in vcf_in.fetch()]
            if 'TotalControls' in global_fields.keys(
            ) and 'TotalCases' in global_fields.keys():
                N = pd.Series([
                    float(global_fields['TotalControls']) +
                    float(global_fields['TotalCases'])
                ] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')

        p = pd.DataFrame({
            'SNP':
            pd.Series([x[0] for x in o], dtype='str'),
            'Z':
            pd.Series([x[1] for x in o], dtype='float'),
            'N':
            N,
            'A1':
            pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'),
            'A2':
            pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')
        })
    else:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float}
        usecols = list(dtype_dict.keys())
        if 'SS' in availcols:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.samples[sample]['SS'][0]
            ] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0]
            ] for rec in vcf_in.fetch()]
            if 'TotalControls' in global_fields.keys(
            ) and 'TotalCases' in global_fields.keys():
                N = pd.Series([
                    float(global_fields['TotalControls']) +
                    float(global_fields['TotalCases'])
                ] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')

        p = pd.DataFrame({
            'SNP': pd.Series([x[0] for x in o], dtype='str'),
            'Z': pd.Series([x[1] for x in o], dtype='float'),
            'N': N
        })

    vcf_in.close()

    if slh is not None:
        compression = get_compression(slh)
        sl = []
        if compression == "gzip":
            try:
                with gzip.open(slh) as f:
                    for line in f:
                        sl.append(line.strip())
            except (AttributeError, ValueError) as e:
                raise ValueError('Improperly formatted snplist file: ' +
                                 str(e.args))
        else:
            try:
                with open(slh) as f:
                    for line in f:
                        sl.append(line.strip())
            except (AttributeError, ValueError) as e:
                raise ValueError('Improperly formatted snplist file: ' +
                                 str(e.args))
        # the `with` blocks above have already closed the file
        p = p.loc[p['SNP'].isin(sl)]

    return p
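
get_compression is not shown; given the branch on "gzip", a sketch based on the file's magic bytes:

def get_compression(path):
    # gzip streams start with the two magic bytes 0x1f 0x8b
    with open(path, "rb") as fh:
        return "gzip" if fh.read(2) == b"\x1f\x8b" else None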
Example #27
def write_vcf(self, path):
    vcf = VariantFile(path, 'w', header=self.header)
    for variant in self.filtered_variants:
        vcf.write(variant.pysam_rec)
    vcf.close()

def main():
    args = process_input()

    chrom_vcf = args.chrom_vcf
    min_r2 = args.min_r2
    min_maf = args.min_maf
    out_prefix = args.out_prefix
    r2_field_name = args.r2_field_name
    maf_field_name = args.maf_field_name
    new_ids = args.new_ids

    ####
    # Read new ids in dictionary
    ####

    new_ids_dict = dict()
    if new_ids is not None:
        with open(new_ids, "r") as f:
            for line in f:
                old_id, new_id = line.rstrip().split("\t")
                new_ids_dict[old_id] = new_id
        print "Ids {0} ids to remap".format(len(new_ids_dict))

    out_vcf_list = "{0}.vcf_list.tsv".format(out_prefix)
    out_vcf_list_handle = open(out_vcf_list, "w")

    for chrom, vcf in chrom_vcf.items():
        chrom_match = re.match("(chr)?(.+)", chrom)
        if chrom_match is not None:
            chrom = chrom_match.group(2)
        else:
            raise ValueError(
                "Chomosome name {0} not formatted correctly!".format(chrom))

        out_vcf_name = "{0}.chr{1}.vcf".format(out_prefix, chrom)
        out_vcf_name_gz = "{0}.chr{1}.vcf.gz".format(out_prefix, chrom)
        out_vcf_name_gz_tbi = "{0}.chr{1}.vcf.gz.tbi".format(out_prefix, chrom)

        print "Processing chr{0} {1}...".format(chrom, vcf)
        in_vcf_handle = VariantFile(vcf)
        pass_filter = in_vcf_handle.header.filters["PASS"]

        out_vcf_list_handle.write("{0}\t{1}".format(chrom, out_vcf_name_gz))
        out_vcf_list_handle.write("\n")

        ####
        # It appears that writing to a BCF is the only method that works in this version of pysam
        ####

        #'wb' for BCF
        #
        #out_vcf_handle = VariantFile(out_vcf_name,'wb',header=in_vcf_handle.header)
        #out_vcf_handle = pysam.libcbgzf.BGZFile(out_vcf_name,"wb")
        #out_vcf_handle.write(str(in_vcf_handle.header))

        #cmd = "bgzip -c > {0}".format(out_vcf_name)
        #print cmd

        out_vcf_handle = open(out_vcf_name, "w")

        print "Relabeling and writing header..."
        relabeled_ids = 0
        old_header_lines = str(in_vcf_handle.header).split("\n")
        for line in old_header_lines:

            if line == "":
                continue

            if re.match("^#CHROM.+", line):
                cols = line.split("\t")
                for i in range(9, len(cols)):
                    if cols[i] in new_ids_dict:
                        relabeled_ids += 1
                        cols[i] = new_ids_dict[cols[i]]
                #merge new columns
                new_line = "\t".join(cols)
                out_vcf_handle.write(new_line)
            else:
                out_vcf_handle.write(line)

            #write new line
            out_vcf_handle.write("\n")

        print "Relabeled {0} ids".format(relabeled_ids)

        rec_count = 0

        for rec in in_vcf_handle:
            rec_count += 1
            if rec_count % 50000 == 0:
                print "Line: {0:d} {1}:{2:d}".format(rec_count, rec.chrom,
                                                     rec.pos)
            r2 = rec.info[r2_field_name]
            maf = rec.info[maf_field_name]
            if r2 > min_r2 and maf > min_maf:
                #clear filters
                rec.filter.clear()
                #set filter to be pass
                rec.filter.add("PASS")
                #new lines are already there
                out_vcf_handle.write(str(rec))

        #print "Running bgzip on "
        ##execute bgzip
        #bgz_handle = Popen(["bgzip", out_vcf_name])
        #bgz_handle.wait()

        in_vcf_handle.close()
        out_vcf_handle.close()

        print "Writing tabix index for {0}...".format(out_vcf_name,
                                                      preset="vcf")
        #seems to only compress files
        pysam.tabix_index(out_vcf_name, preset="vcf")

        if not os.path.isfile(out_vcf_name_gz_tbi):
            pysam.tabix_index(out_vcf_name_gz, preset="vcf")

        if os.path.isfile(out_vcf_name):
            os.remove(out_vcf_name)

    out_vcf_list_handle.close()
    print "Finished writing {0}".format(out_vcf_list)
    print "Complete!"
Example #28
        windowsizes += [startCI, endCI]
        windowsizes_by_caller[caller[0]]["CI_sizes"]["All"] += [startCI, endCI]
        windowsizes_by_caller[caller[0]]["CI_sizes"]["Start"][
            svrec.svtype] += [startCI]
        windowsizes_by_caller[caller[0]]["CI_sizes"]["End"][svrec.svtype] += [
            endCI
        ]
        windowsizes_by_SVType[svrec.svtype] += [startCI, endCI]
        if startCI > 200 or endCI > 200:
            lost_SVs += 1
            windowsizes_by_caller[caller[0]]["Lost_SVs"] += 1
            SVCount_bytype[svrec.svtype + "_lost"] += 1
        elif svrec.svtype == "DEL":
            print(svrec.chrom + ":" + str(svrec.start) + "-" + str(svrec.end))

    vcf_in.close()

fraction_lostSVs_allcallers = round(lost_SVs / total_SVs, 4)

bins = list(range(0, max(windowsizes), 50))

plt.hist(windowsizes,
         bins=bins,
         log=True,
         edgecolor='black',
         linewidth=0.5,
         zorder=3,
         color="seagreen")
plt.xlabel('Breakpoint interval sizes [bp]')
plt.ylabel('Counts [log]')
plt.title(
Example #30
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist',
                        type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('-r',
                        '--region',
                        default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d',
                        '--dist',
                        type=int,
                        default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f',
                        '--frac',
                        type=float,
                        default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x',
                        '--blacklist',
                        metavar='BED.GZ',
                        type=TabixFile,
                        default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    parser.add_argument('-z',
                        '--svsize',
                        type=int,
                        default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [500]')
    parser.add_argument('-p',
                        '--prefix',
                        default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t',
                        '--svtypes',
                        default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--ignore-svtypes',
                        action='store_true',
                        default=False,
                        help='Ignore svtypes when clustering.')
    parser.add_argument('-o',
                        '--sample-overlap',
                        type=float,
                        default=0.0,
                        help='Minimum sample overlap for two variants to be '
                        'clustered together.')
    parser.add_argument('--preserve-ids',
                        action='store_true',
                        default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    parser.add_argument('--preserve-genotypes',
                        action='store_true',
                        default=False,
                        help='In a set of clustered variants, report best '
                        '(highest GQ) non-reference genotype when available.')
    parser.add_argument('--preserve-header',
                        action='store_true',
                        default=False,
                        help='Use header from clustering VCFs')
    parser.add_argument(
        '--skip-merge',
        action='store_true',
        default=False,
        help='Do not merge clustered records. Adds CLUSTER info fields.')
    parser.add_argument(
        '--merge-only',
        action='store_true',
        default=False,
        help=
        'When run on a vcf generated with --skip-merge, only merges records '
        'with identical CLUSTER fields.')
    parser.add_argument(
        '--single-end',
        action='store_true',
        default=False,
        help='Require only one end to be within the minimum distance.')
    #  parser.add_argument('--cluster-bed', type=argparse.FileType('w'),
    #                      help='Bed of constituent calls in each cluster')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.skip_merge and args.merge_only:
        raise ValueError('Cannot use both --skip-merge and --merge-only')

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')
    match_svtypes = not args.ignore_svtypes

    do_merge = not args.skip_merge
    do_cluster = not args.merge_only
    svc = VCFCluster(vcfs,
                     dist=args.dist,
                     blacklist=args.blacklist,
                     frac=args.frac,
                     svtypes=svtypes,
                     region=args.region,
                     match_svtypes=match_svtypes,
                     preserve_ids=args.preserve_ids,
                     preserve_genotypes=args.preserve_genotypes,
                     sample_overlap=args.sample_overlap,
                     preserve_header=args.preserve_header,
                     do_cluster=do_cluster,
                     do_merge=do_merge,
                     single_end=args.single_end)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    fout = VariantFile(fout, mode='w', header=svc.header)

    for i, cluster in enumerate(svc.cluster()):
        if args.prefix:
            cluster_id = [args.prefix]
        else:
            cluster_id = ['SV']
        if args.region:
            chrom = args.region.split(':')[0]
            cluster_id.append(chrom)
        if do_merge and do_cluster:
            cluster_index = i
        else:
            cluster_index = cluster[0].info['CLUSTER']
        cluster_id.append(str(cluster_index + 1))
        cluster_id = '_'.join(cluster_id)

        for record in cluster:
            # Name record
            if do_merge:
                name = cluster_id
            else:
                name = record.id

            record.id = name
            # Size filter (CTX have size -1); applied before writing so that
            # undersized records are actually dropped
            if -1 < record.info['SVLEN'] < args.svsize:
                continue

            fout.write(record)

            #  if args.cluster_bed is not None:
            #  flatten_pos(cluster, record.ID, args.cluster_bed)

    fout.close()
Example #31
        out_vcf.close()
    except Exception as e:
        sys.stderr.write(str(e) + "\n failed to process " + cpath + "\n")


if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()
# pdb.set_trace()

tbl_dict = {}
cat_dict = {'info': 'INFO', 'formats': 'FORMAT'}
for line in open(args.table):
    (cat, key) = line.rstrip('\n').split('\t')
    if cat not in tbl_dict:
        tbl_dict[cat] = []
    tbl_dict[cat].append(key)

good_boy = VariantFile(args.ex_vcf)
file_process(args.in_vcf)
# with open(args.in_vcf) as f:
#     vcf_list = f.read().splitlines()
#     if len(vcf_list[-1]) < 5:
#         vcf_list.pop()

# with concurrent.futures.ThreadPoolExecutor(32) as executor:
#     results = {executor.submit(mt_file_process, fpath): fpath for fpath in vcf_list}
good_boy.close()