Exemplo n.º 1
0
def build_exclude_positions(exclude_file):
    """Build dict of positions to be excluded"""
    positions = set()
    if exclude_file is None:
        return positions
    for line in exclude_file:
        cols = line.split("\t")
        chrom, pos = cols[0:2]
        pos_id = create_pos_id(chrom, pos)
        positions.add(pos_id)
    return positions
Exemplo n.º 2
0
def build_exclude_positions(exclude_file):
    """Build dict of positions to be excluded"""
    positions = set()
    if exclude_file is None:
        return positions
    for line in exclude_file:
        cols = line.split("\t")
        chrom, pos = cols[0:2]
        pos_id = create_pos_id(chrom, pos)
        positions.add(pos_id)
    return positions
Exemplo n.º 3
0
def main():
    """Main program"""

    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        pos_id = create_pos_id(record.CHROM, record.POS)
        if pos_id in excl_pos_set:
            continue
        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue
        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]
        # Skip if symbol is absent
        if args.symbol and vep_effect["SYMBOL"] == "":
            continue
        # Exclude on gene ID or symbol
        if vep_effect["Gene"] in excl_genes_set or vep_effect[
                "SYMBOL"] in excl_genes_set:
            continue
        # Extract gene ID and symbol
        gid, gsymbol = vep_effect["Gene"], vep_effect["SYMBOL"]
        # Extract calls with minimum depth
        calls = []
        for call in record.samples:
            depth = getattr(call.data, "DP", 0)
            allele_depths = getattr(call.data, "AD", (0, 0))
            if (call.gt_type != 0 and depth >= args.min_depth
                    and allele_depths[1] /
                (allele_depths[0] + allele_depths[1]) <
                    args.homo_vaf_threshold):
                calls.append(call)
        # Extract samples
        samples = set(c.sample for c in calls)
        # Add samples to genes dict; using gid and gsymbol for readability
        genes[(gid, gsymbol)].all.update(samples)
        # Update sample lists based on variant type
        if any([eff in vep_effect["Consequence"] for eff in HIGH_IMPACT]):
            genes[(gid, gsymbol)].high.update(samples)
        elif any([eff in vep_effect["Consequence"]
                  for eff in MODERATE_IMPACT]):
            genes[(gid, gsymbol)].moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = [(gene[0], gene[1], len(sets[0]), len(sets[1]), len(sets[2]))
                  for gene, sets in genes.items()]
    genes_list.sort(key=lambda x: x[2], reverse=True)

    # Output sorted gene list
    header = "\t".join([
        "gene_id", "gene_symbol", "num_samples",
        "num_samples_with_moderate_effect", "num_samples_with_high_effect"
    ]) + "\n"
    args.output.write(header)
    for gene in genes_list:
        line = "\t".join(map(str, gene)) + "\n"
        args.output.write(line)

    # Cleanup
    args.output.close()
Exemplo n.º 4
0
def main():
    """Main program"""

    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        pos_id = create_pos_id(record.CHROM, record.POS)
        if pos_id in excl_pos_set:
            continue
        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue
        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]
        # Skip if symbol is absent
        if args.symbol and vep_effect["SYMBOL"] == "":
            continue
        # Exclude on gene ID or symbol
        if vep_effect["Gene"] in excl_genes_set or vep_effect["SYMBOL"] in excl_genes_set:
            continue
        # Extract gene ID and symbol
        gid, gsymbol = vep_effect["Gene"], vep_effect["SYMBOL"]
        # Extract calls with minimum depth
        calls = []
        for call in record.samples:
            depth = getattr(call.data, "DP", 0)
            allele_depths = getattr(call.data, "AD", (0, 0))
            if (call.gt_type != 0 and depth >= args.min_depth and allele_depths[1] / (allele_depths[0] + allele_depths[1]) < args.homo_vaf_threshold):
                calls.append(call)
        # Extract samples
        samples = set(c.sample for c in calls)
        # Add samples to genes dict; using gid and gsymbol for readability
        genes[(gid, gsymbol)].all.update(samples)
        # Update sample lists based on variant type
        if any([eff in vep_effect["Consequence"] for eff in HIGH_IMPACT]):
            genes[(gid, gsymbol)].high.update(samples)
        elif any([eff in vep_effect["Consequence"] for eff in MODERATE_IMPACT]):
            genes[(gid, gsymbol)].moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = [(gene[0], gene[1], len(sets[0]), len(sets[1]), len(sets[2])) for gene, sets in genes.items()]
    genes_list.sort(key=lambda x: x[2], reverse=True)

    # Output sorted gene list
    header = "\t".join(["gene_id", "gene_symbol", "num_samples", "num_samples_with_moderate_effect", "num_samples_with_high_effect"]) + "\n"
    args.output.write(header)
    for gene in genes_list:
        line = "\t".join(map(str, gene)) + "\n"
        args.output.write(line)

    # Cleanup
    args.output.close()