예제 #1
0
파일: hgvs.py 프로젝트: p7k/tempus
    def from_assembly(cls,
                      assembly: Assembly,
                      alt_aln_method: str = 'splign') -> 'HgvsMachinery':
        """
        Initializes `biocommons/hgvs` machinery with Universal Transcript Archive (UTA) data provider.

        :param alt_aln_method: alignment method (default 'splign').
        :param assembly: determines the assembly build.
        """
        data_provider = uta.connect()
        assembly_name = ASSEMBLY__HGVS_ASSEMBLY_NAME[assembly]
        accession__contig = data_provider.get_assembly_map(
            assembly_name=assembly_name)
        return cls(assembly_mapper=AssemblyMapper(
            hdp=data_provider,
            assembly_name=assembly_name,
            alt_aln_method=alt_aln_method,
            add_gene_symbol=False,
            prevalidation_level='NONE',
            normalize=True,
            replace_reference=False),
                   normalizer_3p=Normalizer(hdp=data_provider,
                                            shuffle_direction=3),
                   normalizer_5p=Normalizer(hdp=data_provider,
                                            shuffle_direction=5),
                   accession__contig=accession__contig,
                   contig__accession={
                       contig: acc
                       for acc, contig in accession__contig.items()
                   })
예제 #2
0
def annotator(annotation):
    # 连接UTA, 创建Mapper、Parser、Normalizer
    hdp = connect()
    am = AssemblyMapper(hdp,
                        assembly_name=annotation,
                        alt_aln_method='splign',
                        replace_reference=True)
    hp = Parser()
    hn = Normalizer(hdp)
    return am, hp, hn, hdp
예제 #3
0
HGVS_SEQREPO_DIR environment variables but is otherwise not
configurable by the caller.

"""

from hgvs import __version__, global_config  # noqa: F401
from hgvs.assemblymapper import AssemblyMapper
from hgvs.dataproviders.uta import connect
from hgvs.normalizer import Normalizer
from hgvs.parser import Parser
from hgvs.validator import Validator
from hgvs.variantmapper import VariantMapper

# provide standard abbreviated, short, and long names for instances
hp = parser = hgvs_parser = Parser()
hdp = hgvs_data_provider = connect()
vm = variant_mapper = hgvs_variant_mapper = VariantMapper(hgvs_data_provider)
am37 = hgvs_assembly_mapper_37 = AssemblyMapper(hgvs_data_provider,
                                                assembly_name='GRCh37')
am38 = projector = hgvs_assembly_mapper_38 = AssemblyMapper(
    hgvs_data_provider, assembly_name='GRCh38')
hn = normalizer = hgvs_normalizer = Normalizer(hgvs_data_provider)
hv = validator = hgvs_validator = Validator(hgvs_data_provider)

# functionalized forms of common methods
parse = parser.parse
normalize = normalizer.normalize
validate = validator.validate
c_to_g = projector.c_to_g
c_to_n = projector.c_to_n
c_to_p = projector.c_to_p
def main(args):

    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    log_file_path = artifacts_dir + "brca-pseudonym-generator.log"
    logging.basicConfig(filename=log_file_path,
                        filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    input_header_row = input_file.next()

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"
    ]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    refSeqBRCA1Transcripts = [
        'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
        'NM_007297.3', 'U14680.1'
    ]
    refSeqBRCA2Transcripts = ['U43746.1']

    for line in input_file:

        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(
            ',')[0]

        chrom38 = line[input_header_row.index("Chr")]
        offset38 = line[input_header_row.index("Pos")]
        ref38 = line[input_header_row.index("Ref")]
        alt38 = line[input_header_row.index("Alt")]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])
        transcript37 = get_transcript37(line[refSeqIndex])
        transcript36 = get_transcript36(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue
        cdna_coord = str(
            pyhgvs.format_hgvs_name("chr" + chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] == "-":
            synonymString = []
        elif line[synonymIndex] == "":
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        if line[geneSymbolIndex] == 'BRCA1':
            for transcriptName in refSeqBRCA1Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(
                    pyhgvs.format_hgvs_name(chrom38,
                                            int(offset38),
                                            ref38,
                                            alt38,
                                            genome38,
                                            transcript38,
                                            use_gene=False,
                                            max_allele_length=100))
                synonymString.append(cdna_synonym)
        elif line[geneSymbolIndex] == 'BRCA2':
            for transcriptName in refSeqBRCA2Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(
                    pyhgvs.format_hgvs_name(chrom38,
                                            int(offset38),
                                            ref38,
                                            alt38,
                                            genome38,
                                            transcript38,
                                            use_gene=False,
                                            max_allele_length=100))
                synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD == "-" or cdna_coord_LOVD is None or cdna_coord_LOVD == "":
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            if line[geneSymbolIndex] == 'BRCA1':
                for transcriptName in refSeqBRCA1Transcripts:
                    transcript38 = get_transcript38(transcriptName)
                    cdna_synonym = str(
                        pyhgvs.format_hgvs_name(chrom38LOVD,
                                                int(offset38LOVD),
                                                ref38LOVD,
                                                alt38LOVD,
                                                genome38,
                                                transcript38,
                                                use_gene=False,
                                                max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)
            elif line[geneSymbolIndex] == 'BRCA2':
                for transcriptName in refSeqBRCA2Transcripts:
                    transcript38 = get_transcript38(transcriptName)
                    cdna_synonym = str(
                        pyhgvs.format_hgvs_name(chrom38LOVD,
                                                int(offset38LOVD),
                                                ref38LOVD,
                                                alt38LOVD,
                                                genome38,
                                                transcript38,
                                                use_gene=False,
                                                max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)

        if calcProtein == True:

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion by index
        for i in range(len(new_columns_to_append)):
            line.append('-')

        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom36, offset36, ref36, alt36)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom37, offset37, ref37, alt37)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom38, offset38, ref38, alt38)
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(
            int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(
            int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):

    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    log_file_path = artifacts_dir + "brca-pseudonym-generator.log"
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    input_header_row = input_file.next()

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                          "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                          "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']

    for line in input_file:

        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0]

        chrom38 = line[input_header_row.index("Chr")]
        offset38 = line[input_header_row.index("Pos")]
        ref38 = line[input_header_row.index("Ref")]
        alt38 = line[input_header_row.index("Alt")]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])
        transcript37 = get_transcript37(line[refSeqIndex])
        transcript36 = get_transcript36(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue
        cdna_coord = str(pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] == "-":
            synonymString = []
        elif line[synonymIndex] == "":
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        if line[geneSymbolIndex] == 'BRCA1':
            for transcriptName in refSeqBRCA1Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
                synonymString.append(cdna_synonym)
        elif line[geneSymbolIndex] == 'BRCA2':
            for transcriptName in refSeqBRCA2Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
                synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD == "-" or cdna_coord_LOVD is None or cdna_coord_LOVD == "":
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            if line[geneSymbolIndex] == 'BRCA1':
                for transcriptName in refSeqBRCA1Transcripts:
                    transcript38 = get_transcript38(transcriptName)
                    cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, transcript38, use_gene=False, max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)
            elif line[geneSymbolIndex] == 'BRCA2':
                for transcriptName in refSeqBRCA2Transcripts:
                    transcript38 = get_transcript38(transcriptName)
                    cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, transcript38, use_gene=False, max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)

        if calcProtein == True:

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomicChange)

            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomicChange)

        # Add empty data for each new column to prepare for data insertion by index
        for i in range(len(new_columns_to_append)):
            line.append('-')

        line[output_header_row.index("pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(chrom36,offset36,ref36,alt36)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(chrom37,offset37,ref37,alt37)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(chrom38,offset38,ref38,alt38)
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
예제 #6
0
def main(args):

    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsPColumnName = 'HGVS_Protein'

    labelLine = brcaFile.readline().rstrip().split('\t')
    writeLine = '\t'.join(labelLine) + '\n'
    outputFile.writelines(writeLine)

    # Store indexes of the relevant columns
    hgvsG36Index = labelLine.index(hgvsG36ColumnName)
    hgvsG37Index = labelLine.index(hgvsG37ColumnName)
    hgvsG38Index = labelLine.index(hgvsG38ColumnName)
    refSeqIndex = labelLine.index(refSeqColumnName)
    hgvsCDNAIndex = labelLine.index(hgvsCDNAColumnName)
    hgvsPIndex = labelLine.index(hgvsPColumnName)
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    refSeqBRCA1Transcripts = [
        'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
        'NM_007297.3', 'U14680.1'
    ]
    refSeqBRCA2Transcripts = ['U43746.1']

    for line in brcaFile:
        parsedLine = line.rstrip().split('\t')

        if parsedLine[geneSymbolIndex] == 'BRCA1':
            parsedLine[refSeqIndex] = 'NM_007294.3'
        elif parsedLine[geneSymbolIndex] == 'BRCA2':
            parsedLine[refSeqIndex] = 'NM_000059.3'

        # Format genomic variant position strings to contain relevant refseq strings
        oldHgvsGenomic36 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG36Index]
        oldHgvsGenomic37 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG37Index]
        oldHgvsGenomic38 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG38Index].split(',')[0]
        oldHgvsCDNA = parsedLine[refSeqIndex] + ':' + parsedLine[hgvsCDNAIndex]

        chrom38 = oldHgvsGenomic38.split(':')[1]
        offset38 = oldHgvsGenomic38.split(':')[2]
        ref38 = oldHgvsGenomic38.split(':')[3].split('>')[0]
        alt38 = oldHgvsGenomic38.split(':')[3].split('>')[1]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''

        transcript38 = get_transcript38(parsedLine[refSeqIndex])
        transcript37 = get_transcript37(parsedLine[refSeqIndex])
        transcript36 = get_transcript36(parsedLine[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        cdna_coord = str(
            pyhgvs.format_hgvs_name(chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        synonymString = []
        if parsedLine[geneSymbolIndex] == 'BRCA1':
            for transcriptName in refSeqBRCA1Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(
                    pyhgvs.format_hgvs_name(chrom38,
                                            int(offset38),
                                            ref38,
                                            alt38,
                                            genome38,
                                            transcript38,
                                            use_gene=False,
                                            max_allele_length=100))
                synonymString.append(cdna_synonym)
        elif parsedLine[geneSymbolIndex] == 'BRCA2':
            for transcriptName in refSeqBRCA2Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(
                    pyhgvs.format_hgvs_name(chrom38,
                                            int(offset38),
                                            ref38,
                                            alt38,
                                            genome38,
                                            transcript38,
                                            use_gene=False,
                                            max_allele_length=100))
                synonymString.append(cdna_synonym)

        if calcProtein == True:
            #print('oldHgvsGenomic38:', oldHgvsGenomic38)
            #print('oldHgvsCDNA: ', oldHgvsCDNA)
            #print('cdna: ', cdna_coord)

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print(
                    'GRCh38 Genomic change: ',
                    '{0}:{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38))
                print('')
            #print('oldProtein: ', parsedLine[hgvsPIndex])
            #print('protein:', protein_coord)
            #print('')

        # write new data into line
        parsedLine[hgvsG36Index] = '{0}:{1}:{2}>{3}'.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = '{0}:{1}:{2}>{3}'.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = '{0}:{1}:{2}>{3}'.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))
        parsedLine[synonymIndex] = ','.join(synonymString)
        writeLine = '\t'.join(parsedLine) + '\n'
        outputFile.writelines(writeLine)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()