예제 #1
0
파일: __init__.py 프로젝트: biowdl/biotdg
def vcf_to_mutations(vcf_path: str, sample: str
                     ) -> Dict[str, Dict[int, List[Mutation]]]:
    """
    Collect the alleles of one sample in a VCF as mutations, grouped first by
    contig and then by chromosome copy.

    The result maps contig name (e.g. "chr1") to a dictionary that maps the
    chromosome number (the position of the allele within the sample's
    genotype, so 0 and 1 for a ploidy of 2) to the list of mutations on that
    copy, in the order the records appear in the VCF.
    :param vcf_path: String that represents the path to the VCF
    :param sample: Which sample in the VCF should be used.
    :return: A dictionary of contigs with dictionaries of chromosomes that have
    lists of mutations.
    """
    per_contig: Dict[str, Dict[int, List[Mutation]]] = {}
    reader = cyvcf2.VCFReader(vcf_path)
    try:
        # Restrict the reader to the requested sample so that index 0 of
        # every per-sample array below refers to that sample.
        reader.set_samples([sample])
        for variant in reader:  # type: cyvcf2.Variant
            contig = variant.CHROM
            start = variant.start
            end = variant.end
            # Index 0 is the reference allele, 1..n are the alternatives.
            alleles = [variant.REF] + variant.ALT
            # The last element of the genotype row is the phased flag, not
            # an allele index, so it is dropped.
            allele_indexes = variant.genotypes[0][:-1]
            per_chromosome = per_contig.setdefault(contig, {})
            for chromosome_no, allele_idx in enumerate(allele_indexes):
                # NOTE(review): if a missing allele is reported as -1, this
                # lookup silently selects the last ALT - confirm intended.
                mutation = Mutation(start, end, alleles[allele_idx])
                per_chromosome.setdefault(chromosome_no, []).append(mutation)
    finally:
        reader.close()

    return per_contig
예제 #2
0
 def _get_vcf_reader(self):
     """Return a new ``vcf.VCFReader`` built from ``self.args.vcf``."""
     vcf_argument = self.args.vcf
     return vcf.VCFReader(vcf_argument)
예제 #3
0
        vcf_parsing.parse_vcf("{}.freebayes.normalized.vcf".format(sample),
                              "freebayes", caller_records)
        vcf_parsing.parse_vcf("{}.scalpel.normalized.vcf".format(sample),
                              "scalpel", caller_records)
        vcf_parsing.parse_vcf("{}.platypus.normalized.vcf".format(sample),
                              "platypus", caller_records)
        vcf_parsing.parse_vcf("{}.pindel.normalized.vcf".format(sample),
                              "pindel", caller_records)

        annotated_vcf = "{}.vcfanno.snpEff.GRCh37.75.vcf".format(sample)

        sys.stdout.write("Parsing VCFAnno VCF\n")
        vcf = VCF(annotated_vcf)

        sys.stdout.write("Parsing VCFAnno VCF with CyVCF2\n")
        reader = cyvcf2.VCFReader(annotated_vcf)
        desc = reader["ANN"]["Description"]
        annotation_keys = [
            x.strip("\"'") for x in re.split("\s*\|\s*",
                                             desc.split(":", 1)[1].strip('" '))
        ]

        # Filter out variants with minor allele frequencies above the threshold but
        # retain any that are above the threshold but in COSMIC or in ClinVar and not listed as benign.
        sys.stdout.write("Processing individual variants\n")

        lib_variant_inserts = list()
        variant_inserts = list()

        for var in vcf:
            # Parsing VCF and creating data structures for Cassandra model
예제 #4
0
def process_sample(job, parse_functions, sample, samples, config):
    """Store the annotated variants of one sample in the Cassandra model.

    The per-caller normalized VCFs of ``sample`` are parsed first. Then every
    record of the vcfanno/snpEff annotated VCF is combined with the matching
    per-caller data and written with ``Variant.create``. A summary is appended
    to ``<library_name>.sample_variant_add.log`` and sent to
    ``job.fileStore.logToMaster``.

    :param job: Object exposing ``fileStore.logToMaster``.
    :param parse_functions: Mapping of caller name to a callable that turns
        that caller's stored record into a dict with at least the keys
        ``AAF`` and ``DP``.
    :param sample: Sample identifier; prefix of the VCF file names and key
        into ``samples``.
    :param samples: Mapping of sample identifier to its metadata dict
        (``sample_name``, ``library_name``, ``run_id``, ...).
    :param config: Mapping from which ``genome_version`` is read.
    :return: None.
    """
    caller_records = defaultdict(dict)
    sample_info = samples[sample]

    sys.stdout.write("Parsing Caller VCF Files\n")
    # Same callers, in the same order, as the individual calls they replace.
    for caller_name in ("mutect", "vardict", "freebayes", "scalpel",
                        "platypus", "pindel"):
        vcf_parsing.parse_vcf(
            "{}.{}.normalized.vcf".format(sample, caller_name),
            caller_name, caller_records)

    annotated_vcf = "{}.vcfanno.snpEff.GRCh37.75.vcf".format(sample)

    sys.stdout.write("Parsing VCFAnno VCF\n")
    vcf = VCF(annotated_vcf)

    sys.stdout.write("Parsing VCFAnno VCF with CyVCF2\n")
    # This second reader is only needed for the ANN header entry, so it is
    # closed as soon as the description has been read.
    reader = cyvcf2.VCFReader(annotated_vcf)
    try:
        desc = reader["ANN"]["Description"]
    finally:
        reader.close()
    # The text after the first ':' of the description is a '|'-separated
    # list of annotation field names.
    annotation_keys = [
        x.strip("\"'")
        for x in re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))
    ]

    sys.stdout.write("Processing individual variants\n")
    added = 0
    failed = 0
    # Starting value for the minimum search; reported as -1 when untouched.
    depth_sentinel = 100000000
    for variant in vcf:
        # Parsing VCF and creating data structures for Cassandra model
        callers = variant.INFO.get('CALLERS').split(',')
        effects = utils.get_effects(variant, annotation_keys)
        top_impact = utils.get_top_impact(effects)
        population_freqs = utils.get_population_freqs(variant)
        amplicon_data = utils.get_amplicon_data(variant)

        # Lookup key into the per-caller records filled by parse_vcf.
        key = (unicode("chr{}".format(variant.CHROM)), int(variant.start),
               int(variant.end), unicode(variant.REF), unicode(variant.ALT[0]))

        caller_variant_data_dicts = defaultdict(dict)
        max_som_aaf = -1.00
        max_depth = -1
        min_depth = depth_sentinel

        for caller in callers:
            caller_data = parse_functions[caller](caller_records[caller][key])
            caller_variant_data_dicts[caller] = caller_data
            caller_aaf = float(caller_data['AAF'])
            if caller_aaf > max_som_aaf:
                max_som_aaf = caller_aaf
            caller_depth = int(caller_data['DP'])
            if caller_depth < min_depth:
                min_depth = caller_depth
            if caller_depth > max_depth:
                max_depth = caller_depth

        if min_depth == depth_sentinel:
            min_depth = -1

        # Create Cassandra Objects
        # Create the general variant ordered table
        try:
            Variant.create(
                    reference_genome=config['genome_version'],
                    chr=variant.CHROM,
                    pos=variant.start,
                    end=variant.end,
                    ref=variant.REF,
                    alt=variant.ALT[0],
                    sample=sample_info['sample_name'],
                    extraction=sample_info['extraction'],
                    library_name=sample_info['library_name'],
                    run_id=sample_info['run_id'],
                    panel_name=sample_info['panel'],
                    # initial_report_panel=sample_info['report'],
                    target_pool=sample_info['target_pool'],
                    sequencer=sample_info['sequencer'],
                    rs_id=variant.ID,
                    date_annotated=datetime.now(),
                    subtype=variant.INFO.get('sub_type'),
                    type=variant.INFO.get('type'),

                    gene=top_impact.gene,
                    transcript=top_impact.transcript,
                    exon=top_impact.exon,
                    codon_change=top_impact.codon_change,
                    biotype=top_impact.biotype,
                    aa_change=top_impact.aa_change,
                    severity=top_impact.effect_severity,
                    impact=top_impact.top_consequence,
                    impact_so=top_impact.so,

                    max_maf_all=variant.INFO.get('max_aaf_all') or -1,
                    max_maf_no_fin=variant.INFO.get('max_aaf_no_fin') or -1,
                    transcripts_data=utils.get_transcript_effects(effects),
                    clinvar_data=utils.get_clinvar_info(variant, samples, sample),
                    cosmic_data=utils.get_cosmic_info(variant),
                    in_clinvar=vcf_parsing.var_is_in_clinvar(variant),
                    in_cosmic=vcf_parsing.var_is_in_cosmic(variant),
                    is_pathogenic=vcf_parsing.var_is_pathogenic(variant),
                    is_lof=vcf_parsing.var_is_lof(variant),
                    is_coding=vcf_parsing.var_is_coding(variant),
                    is_splicing=vcf_parsing.var_is_splicing(variant),
                    rs_ids=vcf_parsing.parse_rs_ids(variant),
                    cosmic_ids=vcf_parsing.parse_cosmic_ids(variant),
                    callers=callers,
                    population_freqs=population_freqs,
                    amplicon_data=amplicon_data,
                    max_som_aaf=max_som_aaf,
                    min_depth=min_depth,
                    max_depth=max_depth,
                    mutect=caller_variant_data_dicts['mutect'] or dict(),
                    freebayes=caller_variant_data_dicts['freebayes'] or dict(),
                    scalpel=caller_variant_data_dicts['scalpel'] or dict(),
                    platypus=caller_variant_data_dicts['platypus'] or dict(),
                    pindel=caller_variant_data_dicts['pindel'] or dict(),
                    vardict=caller_variant_data_dicts['vardict'] or dict(),
                    manta=caller_variant_data_dicts['manta'] or dict()
                    )
        except Exception as exc:
            # NOTE(review): the original handler for this try is not visible
            # here; the database driver's own exception type is unknown, so
            # any failure is counted and reported instead of aborting the
            # whole sample. Narrow this once the intended type is confirmed.
            failed += 1
            sys.stderr.write(
                "Failed to add variant {}:{} to variantstore: {}\n".format(
                    variant.CHROM, variant.start, exc))
        else:
            added += 1

    with open("{}.sample_variant_add.log".format(sample_info['library_name']), "a") as err:
        err.write("Sample: {}\t Library: {}\n".format(sample_info['sample_name'],
                                                      sample_info['library_name'], ))
        err.write("Wrote {} variants to variantstore\n".format(added))
        err.write("Failed to add {} variants to variantstore\n".format(failed))

    job.fileStore.logToMaster("Variant data for {} variants saved to Cassandra for sample {}."
                              "{} variants failed to add to database\n".format(added, sample, failed))