示例#1
0
    def compute_hgvs(x):
        """Return the cDNA HGVS representation of the variant described by row x.

        A VCFVariant is built from the CHR_COL, POS_COL, REF_COL and ALT_COL
        entries of ``x`` and converted to an HGVS object with the GRCh38
        contig map of ``hgvs_proc``. ``hgvs_proc`` and ``normalize`` come
        from the enclosing scope, which is not visible in this block.
        """
        vcf_variant = VCFVariant(x[CHR_COL], x[POS_COL], x[REF_COL],
                                 x[ALT_COL])
        genomic = vcf_variant.to_hgvs_obj(
            hgvs_proc.contig_maps[HgvsWrapper.GRCh38_Assem])

        if normalize:
            normalized = hgvs_proc.normalizing(genomic)
            # Keep the un-normalized variant when normalizing yields a falsy
            # result.
            if normalized:
                genomic = normalized

        return hgvs_proc.genomic_to_cdna(genomic)
示例#2
0
def convert_to_hg37(vars, brca_resources_dir):
    """Lift variants over from hg38 to hg19/hg37 coordinates using CrossMap.

    The variants are written to a temporary header-less VCF file, which is
    converted by running ``CrossMap.py vcf`` as a subprocess. The converted
    file is then parsed back into ``VCFVariant`` objects. All temporary
    files live in a private scratch directory that is removed before
    returning, whether or not the conversion succeeded.

    Args:
        vars: iterable of objects exposing ``chr``, ``pos``, ``ref`` and
            ``alt`` attributes.
        brca_resources_dir: directory containing
            ``hg38ToHg19.over.chain.gz`` and ``hg19.fa``.

    Returns:
        A list of ``VCFVariant`` objects, one per data line of the CrossMap
        output file, in file order. Blank lines and lines starting with
        ``#`` are skipped.

    Raises:
        OSError / IOError: if ``CrossMap.py`` cannot be started or the
            expected output file was not produced.

    NOTE(review): callers appear to pair the result positionally with
    ``vars``. That only holds if CrossMap emits exactly one output line per
    input line -- confirm how unmapped variants are reported.
    """
    # Function-scope imports: only this function needs these modules.
    import os
    import shutil

    def pseudo_vcf_entry(v):
        # Eight tab-separated fields: chr, pos, '.', ref, alt and three
        # empty trailing fields.
        entries = [v.chr, v.pos, '.', v.ref, v.alt, '', '', '']
        return '\t'.join([str(s) for s in entries])

    lst = [pseudo_vcf_entry(v) for v in vars]

    # A private scratch directory replaces the deprecated, race-prone
    # tempfile.mktemp() and lets everything written during the run be
    # removed in one go.
    tmp_dir = tempfile.mkdtemp()
    try:
        vcf_tmp = os.path.join(tmp_dir, 'input_hg38.vcf')
        vcf_tmp_out = os.path.join(tmp_dir, 'output_hg19.vcf')

        with open(vcf_tmp, 'w') as f:
            f.write('\n'.join(lst))

        args = [
            "CrossMap.py", "vcf",
            brca_resources_dir + "/hg38ToHg19.over.chain.gz", vcf_tmp,
            brca_resources_dir + "/hg19.fa", vcf_tmp_out
        ]

        logging.info("Running CrossMap.py to convert to hg19")
        # Without pipes communicate() returns (None, None) and nothing
        # would ever be logged; text mode yields str instead of bytes.
        sp = subprocess.Popen(args,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True)
        out, err = sp.communicate()
        if out:
            logging.info("standard output of subprocess: {}".format(out))
        if err:
            logging.info("standard error of subprocess: {}".format(err))

        # The output file is not pre-created, so a run that produced
        # nothing still fails loudly here.
        with open(vcf_tmp_out, 'r') as f:
            vcf_out_lines = f.readlines()
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    converted = []
    for line in vcf_out_lines:
        line = line.strip()
        if not line or line.startswith('#'):
            # Neither blank lines nor VCF header lines describe a variant.
            continue
        fields = line.split('\t')
        converted.append(
            VCFVariant(fields[0], int(fields[1]), fields[3], fields[4]))
    return converted
def test_hg19_to_hg38(hgvs_wrapper):
    """Check ``hgvs_wrapper.hg19_to_hg38`` against known coordinate pairs.

    Each case pairs an hg19 variant string with the expected hg38 variant
    string. An expected value of ``None`` means the conversion must raise
    ``ValueError``.
    """
    cases = [
        ('chr13:g.32890614:AAG>A', 'chr13:g.32316477:AAG>A'),
        ('chr17:g.41231221:A>C', 'chr17:g.43079204:A>C'),
        ('chr17:g.41205772:G>A', 'chr17:g.43053755:G>A'),
        ('chr17:g.41277290:T>C', 'chr17:g.43125273:T>C'),
        ('chr17:g.41279883:GAC>G', None),  # outside transcript boundaries
    ]

    for hg19_str, expected_hg38_str in cases:
        hg19_obj = VCFVariant.from_str(hg19_str).to_hgvs_obj(
            hgvs_wrapper.contig_maps[hgvs_utils.HgvsWrapper.GRCh37_Assem])

        if not expected_hg38_str:
            with pytest.raises(ValueError):
                hgvs_wrapper.hg19_to_hg38(hg19_obj)
            continue

        lifted = VCFVariant.from_hgvs_obj(
            hgvs_wrapper.hg19_to_hg38(hg19_obj))
        assert VCFVariant.from_str(expected_hg38_str) == lifted
示例#4
0
def from_genomic(parsedLine, fieldIdxDict, hgvs_wrapper, seq_fetcher37,
                 errorsFile):
    """Build a ``VCFVariant`` from the genomic HGVS field of a parsed line.

    The chromosome field (with every ``chr`` occurrence removed) is embedded
    into an accession of the form ``NC_0000<chromosome>.10``. That accession
    is joined to the ``gDNA`` field with a colon. The resulting string is
    parsed, normalized with ``shuffle_direction=5`` and converted through
    ``VCFVariant.from_hgvs_obj``.

    NOTE(review): the fixed ``NC_0000`` prefix and ``.10`` version presumably
    target specific two-digit chromosomes -- confirm before using this for
    other chromosomes.

    Args:
        parsedLine: indexable collection holding the field values of a line.
        fieldIdxDict: maps the field names ``'chromosome'`` and ``'gDNA'``
            to their index in ``parsedLine``.
        hgvs_wrapper: object providing ``hgvs_parser`` and ``hgvs_dp``.
        seq_fetcher37: passed through to ``VCFVariant.from_hgvs_obj``.
        errorsFile: writable file object that receives a message when an
            ``HGVSError`` occurs.

    Returns:
        The converted variant, or ``None`` if an ``HGVSError`` was raised.
    """
    chromosome = str(parsedLine[fieldIdxDict['chromosome']].replace(
        'chr', ''))
    accession = 'NC_0000' + chromosome + '.10'
    var_str = accession + ":" + parsedLine[fieldIdxDict['gDNA']]

    try:
        parsed = hgvs_wrapper.hgvs_parser.parse(var_str)
        normalizer = hgvs.normalizer.Normalizer(hgvs_wrapper.hgvs_dp,
                                                shuffle_direction=5)
        normalized = normalizer.normalize(parsed)
        return VCFVariant.from_hgvs_obj(normalized, seq_fetcher37)
    except HGVSError as e:
        # Record the failure and signal it to the caller with None.
        message = ('Could not parse genomic field ' + str(var_str) +
                   '. Error was ' + str(e))
        print(message, file=errorsFile)
        return None
def hgvs_variant(hgvs_wrapper):
    """Return ``chr13:g.32316477:AAG>A`` as an HGVS object.

    The conversion uses the GRCh38 contig map of ``hgvs_wrapper``.
    """
    vcf_variant = VCFVariant.from_str('chr13:g.32316477:AAG>A')
    grch38_contigs = hgvs_wrapper.contig_maps[
        hgvs_utils.HgvsWrapper.GRCh38_Assem]
    return vcf_variant.to_hgvs_obj(grch38_contigs)
示例#6
0
def main(input, output, pkl, log_path, config_file, resources):
    """Annotate a tab-separated variant table with HGVS-derived columns.

    Reads the table at ``input`` and adds cDNA, reference sequence, hg38 and
    hg37 genomic coordinate, protein and synonym columns. The temporary
    helper columns are then dropped and the table is written to ``output``
    as a tab-separated file without the index.

    Args:
        input: source handed to ``pd.read_csv`` (tab-separated).
        output: destination handed to ``DataFrame.to_csv``.
        pkl: forwarded unchanged to ``_get_cdna``.
        log_path: log file; it is opened with file mode ``"w"``.
        config_file: handed to ``config.load_config``; its rows supply, per
            gene symbol, the synonym accessions and the default cDNA
            accession.
        resources: resource directory handed to ``convert_to_hg37``.
    """
    logging.basicConfig(filename=log_path,
                        filemode="w",
                        level=logging.INFO,
                        format=' %(asctime)s %(filename)-15s %(message)s')

    gene_config = config.load_config(config_file)

    # Per gene symbol: the list of synonym accessions (split on ';') and the
    # default cDNA accession.
    synonym_accessions = {}
    default_cdna_accession = {}
    for _, gene_row in gene_config.iterrows():
        symbol = gene_row[config.SYMBOL_COL]
        synonym_accessions[symbol] = gene_row[config.SYNONYM_AC_COL].split(
            ';')
        default_cdna_accession[symbol] = gene_row[config.HGVS_CDNA_DEFAULT_AC]

    hgvs_proc = HgvsWrapper()

    variants_df = pd.read_csv(input, sep='\t')

    def row_to_variant(row):
        return VCFVariant(row[CHR_COL], row[POS_COL], row[REF_COL],
                          row[ALT_COL])

    variants_df[VAR_OBJ_FIELD] = variants_df.apply(row_to_variant, axis=1)

    # ---- cDNA ----
    variants_df[TMP_CDNA_NORM_FIELD] = _get_cdna(variants_df,
                                                 pkl,
                                                 hgvs_proc,
                                                 default_cdna_accession,
                                                 normalize=True)
    variants_df[PYHGVS_CDNA_COL] = variants_df[TMP_CDNA_NORM_FIELD].apply(str)

    has_cdna = variants_df[PYHGVS_CDNA_COL].str.startswith("NM_")
    lacks_cdna = ~has_cdna

    def before_colon(parts):
        return parts[0]

    def after_colon(parts):
        return parts[1]

    # "<accession>:<cDNA change>" is split into its two components.
    variants_df.loc[has_cdna, REFERENCE_SEQUENCE_COL] = (
        variants_df.loc[has_cdna,
                        PYHGVS_CDNA_COL].str.split(':').apply(before_colon))
    variants_df.loc[has_cdna, HGVS_CDNA_COL] = (
        variants_df.loc[has_cdna,
                        PYHGVS_CDNA_COL].str.split(':').apply(after_colon))

    # Rows without a cDNA still get the gene's default reference sequence,
    # because downstream steps rely on that column being set.
    def default_accession_for(gene_symbol):
        return default_cdna_accession[gene_symbol]

    variants_df.loc[lacks_cdna, REFERENCE_SEQUENCE_COL] = (
        variants_df.loc[lacks_cdna,
                        GENE_SYMBOL_COL].apply(default_accession_for))
    variants_df.loc[lacks_cdna, HGVS_CDNA_COL] = '-'

    # ---- genomic coordinates ----
    variants_df[PYHGVS_GENOMIC_COORDINATE_38_COL] = (
        variants_df[VAR_OBJ_FIELD].apply(str))

    hg37_variants = convert_to_hg37(variants_df[VAR_OBJ_FIELD], resources)
    variants_df[PYHGVS_GENOMIC_COORDINATE_37_COL] = pd.Series(
        [str(v) for v in hg37_variants])

    variants_df[PYHGVS_HG37_START_COL] = pd.Series(
        [v.pos for v in hg37_variants])
    # The hg37 end is the hg37 start shifted by the hg38 end/start distance.
    hg38_span = variants_df[HG38_END_COL] - variants_df[HG38_START_COL]
    variants_df[PYHGVS_HG37_END_COL] = (variants_df[PYHGVS_HG37_START_COL] +
                                        hg38_span)

    # ---- protein ----
    def protein_string(cdna):
        return str(hgvs_proc.to_protein(cdna))

    variants_df[PYHGVS_PROTEIN_COL] = variants_df[TMP_CDNA_NORM_FIELD].apply(
        protein_string)

    # ---- synonyms ----
    def synonyms_for(row):
        return get_synonyms(row, hgvs_proc, synonym_accessions)

    variants_df[NEW_SYNONYMS_FIELD] = variants_df.apply(synonyms_for, axis=1)

    variants_df[SYNONYMS_COL] = variants_df[SYNONYMS_COL].fillna(
        '').str.strip()

    # Combine the pre-existing synonyms with the generated ones.
    variants_df[SYNONYMS_COL] = variants_df.apply(_merge_synonyms, axis=1)

    # ---- output ----
    # The helper columns are not part of the written table.
    helper_columns = [VAR_OBJ_FIELD, NEW_SYNONYMS_FIELD, TMP_CDNA_NORM_FIELD]
    variants_df = variants_df.drop(columns=helper_columns)

    variants_df.to_csv(output, sep='\t', index=False)