Пример #1
0
Файл: gtf.py Проект: zmcv/seqr
def get_data_from_gencode_gtf(gtf_file):
    """
    Parse gencode GTF file

    Args:
        gtf_file: iterable of text lines in GTF format (e.g. an open file)

    Returns iter of (datatype, dict) tuples
    datatype is one of gene, transcript, exon, cds
    dict is the corresponding object: the key/value pairs of the GTF
    attribute column (surrounding double quotes removed; gene_id,
    transcript_id and exon_id truncated at their first '.'), plus the keys
    chrom, start, stop, xstart and xstop.

    Skipped without being yielded: comment lines, blank lines, feature types
    other than the four above, contigs whose name is still longer than 3
    characters after its first 3 characters are dropped, and records whose
    gene_id starts with 'ENSGR'.
    """
    feature_types = ('gene', 'transcript', 'exon', 'CDS')

    for line in gtf_file:
        if line.startswith('#'):
            continue

        # Drop the line terminator only (also handles CRLF files); the tabs
        # separating the columns must survive.
        line = line.rstrip('\r\n')
        if not line.strip():
            continue  # blank line, nothing to parse
        fields = line.split('\t')

        if fields[2] not in feature_types:
            continue

        # Drops the first three characters of the contig name
        # (assumes a 'chr'-style prefix -- TODO confirm for non-gencode input).
        chrom = fields[0][3:]
        if len(chrom) > 3:
            continue # skip the pseudo contigs

        start = int(fields[3])  # GTF files are 1-indexed: http://www.ensembl.org/info/website/upload/gff.html
        stop = int(fields[4])

        # The attribute column is a ';'-separated list of `key value` pairs.
        # Split each pair only once so values that contain spaces stay intact,
        # and ignore empty segments such as the one after the final ';'.
        info = {}
        for attribute in fields[8].split(';'):
            key_and_value = attribute.strip().split(None, 1)
            if not key_and_value:
                continue
            key = key_and_value[0]
            value = key_and_value[1] if len(key_and_value) > 1 else ''
            info[key] = value.strip('"')

        if 'gene_id' in info:
            info['gene_id'] = info['gene_id'].split('.')[0]

            # ignore all entities that are part of an ENSGR gene
            if info['gene_id'].startswith('ENSGR'):
                continue

        if 'transcript_id' in info:
            info['transcript_id'] = info['transcript_id'].split('.')[0]
        if 'exon_id' in info:
            info['exon_id'] = info['exon_id'].split('.')[0]

        info['chrom'] = chrom
        info['start'] = start
        info['stop'] = stop
        info['xstart'] = get_xpos(chrom, start)
        info['xstop'] = get_xpos(chrom, stop)

        # pretend 'CDS' isn't capitalized in gencode gtf file
        yield fields[2].lower(), info
Пример #2
0
def parse_clinvar_vcf(clinvar_vcf_path=None):
    """Load clinvar vcf file

    Rows have the following format:

    #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
    1\t949422\t475283\tG\tA\t.\t.\tALLELEID=446939;CLNDISDB=MedGen:C4015293,OMIM:616126,Orphanet:ORPHA319563;CLNDN=Immunodeficiency_38_with_basal_ganglia_calcification;CLNHGVS=NC_000001.10:g.949422G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=ISG15:9636;MC=SO:0001583|missense_variant;ORIGIN=1;RS=143888043

    Args:
        clinvar_vcf_path (string): optional alternate path. Defaults to
            settings.REFERENCE_SETTINGS.clinvar_vcf_file. Paths ending in
            ".gz" are opened with gzip.

    Yields:
        dict with the keys 'xpos', 'ref', 'alt', 'variant_id' and 'clinsig'
        (the lower-cased CLNSIG value) for every record that passes
        valid_chrom() and has an informative clinical significance.
    """
    if clinvar_vcf_path is None:
        clinvar_vcf_path = settings.REFERENCE_SETTINGS.clinvar_vcf_file

    # CLNSIG values that carry no usable classification. INFO values in the
    # VCF use '_' where the label has a space (see the CLNREVSTAT example in
    # the docstring), so the underscore spelling is listed next to the
    # original one.
    uninformative_clinsig = ("", "not provided", "not_provided", "other", "association")

    header = None

    # NOTE(review): gzip.open() defaults to binary mode; on Python 3 that would
    # yield bytes lines -- confirm the interpreter version this runs under.
    open_file = gzip.open if clinvar_vcf_path.endswith(".gz") else open

    # The with-block closes the file once the generator is exhausted or closed.
    with open_file(clinvar_vcf_path) as clinvar_file:
        for line in tqdm.tqdm(clinvar_file, unit=" clinvar records"):
            line = line.strip()
            if not line or line.startswith("##"):
                continue

            # The first non-meta line is the '#CHROM ...' column header.
            if header is None:
                header = get_vcf_headers(line)
                continue

            fields = dict(zip(header, line.split("\t")))

            # INFO is a ';'-separated list of KEY=VALUE entries. partition()
            # tolerates flag entries without '=' and values that contain '='.
            for info in fields['INFO'].split(';'):
                info_key, _, info_value = info.partition('=')
                fields[info_key] = info_value

            chrom = fields["CHROM"]
            pos = int(fields["POS"])
            ref = fields["REF"]
            alt = fields["ALT"]
            variant_id = fields["ID"]

            if not valid_chrom(chrom):
                continue

            clinical_significance = fields.get("CLNSIG", "").lower()
            if clinical_significance in uninformative_clinsig:
                continue

            yield {
                'xpos': get_xpos(chrom, pos),
                'ref': ref,
                'alt': alt,
                'variant_id': variant_id,
                'clinsig': clinical_significance,
            }
Пример #3
0
            header = fields
        else:
            line_dict = dict(zip(header, fields))
            chrom = line_dict["chrom"]
            pos = int(line_dict["pos"])
            ref = line_dict["ref"]
            alt = line_dict["alt"]
            if "M" in chrom:
                continue   # because get_xpos doesn't support chrMT.
            clinical_significance = line_dict["clinical_significance"].lower()
            if clinical_significance in ["not provided", "other", "association"]:
                continue
            else:
                for c in clinical_significance.split(";"):
                    pathogenicity_values_counter[c] += 1
            xpos = get_xpos(chrom, pos)
            CLINVAR_VARIANTS[(xpos, ref, alt)] = (line_dict["measureset_id"], clinical_significance)
    #for k in sorted(pathogenicity_values_counter.keys(), key=lambda k: -pathogenicity_values_counter[k]):
    #    print("     %5d  %s"  % (pathogenicity_values_counter[k], k))

# Set the secret key: read it from /etc/xbrowse_django_secret_key when that
# file is readable, otherwise use a hard-coded development-only value.
if not os.access("/etc/xbrowse_django_secret_key", os.R_OK):
    print("Warning: could not access /etc/xbrowse_django_secret_key. Falling back on insecure hard-coded SECRET_KEY")
    SECRET_KEY = "~~~ this key string is FOR DEVELOPMENT USE ONLY ~~~"

else:
    with open("/etc/xbrowse_django_secret_key") as f:
        SECRET_KEY = f.read().strip()

    # The secure-cookie flags are switched on only in this branch, i.e. only
    # when the key file could be read.
    SESSION_COOKIE_SECURE = True
    CSRF_COOKIE_SECURE = True
Пример #4
0
            header = fields
        else:
            line_dict = dict(zip(header, fields))
            chrom = line_dict["chrom"]
            pos = int(line_dict["pos"])
            ref = line_dict["ref"]
            alt = line_dict["alt"]
            if "M" in chrom:
                continue   # because get_xpos doesn't support chrMT.
            clinical_significance = line_dict["clinical_significance"].lower()
            if clinical_significance in ["not provided", "other", "association"]:
                continue
            else:
                for c in clinical_significance.split(";"):
                    pathogenicity_values_counter[c] += 1
            xpos = get_xpos(chrom, pos)
            CLINVAR_VARIANTS[(xpos, ref, alt)] = (line_dict["measureset_id"], clinical_significance)
    #for k in sorted(pathogenicity_values_counter.keys(), key=lambda k: -pathogenicity_values_counter[k]):
    #    print("     %5d  %s"  % (pathogenicity_values_counter[k], k))
    # print("%d variants loaded" % len(CLINVAR_VARIANTS))


if len(sys.argv) >= 2 and sys.argv[1] == 'test':
    # use in-memory sqlite database for running tests
    DATABASES['default'] = {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': 'seqr_test_db.sqlite',
        'USER': '',
        'PASSWORD': '',
        'HOST': '',
        'PORT': '',
Пример #5
0
def get_clinvar_variants():
    """Return the ClinVar lookup table, loading it from disk on first use.

    The table is parsed from the tab-separated file at settings.CLINVAR_TSV
    and cached in the module-level _CLINVAR_VARIANTS, so the file is read at
    most once per successful load.

    Returns:
        dict mapping (xpos, ref, alt) to a tuple of
        (variation_id or measureset_id, lower-cased clinical_significance).

    Raises:
        ValueError: if settings.CLINVAR_TSV is unset, is not a file, or its
            header line does not contain 'clinical_significance'.

    Rows that fail to parse are reported on stderr and skipped.
    """
    global _CLINVAR_VARIANTS

    if _CLINVAR_VARIANTS is not None:
        return _CLINVAR_VARIANTS

    if not settings.CLINVAR_TSV:
        raise ValueError("settings.CLINVAR_TSV not set")

    if not os.path.isfile(settings.CLINVAR_TSV):
        raise ValueError("settings.CLINVAR_TSV file not found: %s" %
                         (settings.CLINVAR_TSV, ))

    # Build into a local dict and publish it only after the whole file has
    # been read, so a failed load does not leave an empty cache behind that
    # later calls would return as if it were valid.
    variants = {}
    header = None

    print("Reading Clinvar data into memory: " + settings.CLINVAR_TSV)
    with open(settings.CLINVAR_TSV) as clinvar_tsv:
        for line in clinvar_tsv:
            line = line.strip()
            if line.startswith("#"):
                continue
            fields = line.split("\t")

            # The first non-comment line is the column header.
            if header is None:
                header = fields
                if "clinical_significance" not in line.lower():
                    raise ValueError(
                        "'clinical_significance' not found in header line: %s"
                        % str(header))
                continue

            # Per-row parsing is best-effort: a bad row is logged and skipped
            # rather than aborting the whole load.
            try:
                if "clinical_significance" in line.lower():
                    raise ValueError(
                        "'clinical_significance' found in non-header line: %s"
                        % str(header))

                line_dict = dict(zip(header, fields))
                chrom = line_dict["chrom"]
                pos = int(line_dict["pos"])
                ref = line_dict["ref"]
                alt = line_dict["alt"]
                if "M" in chrom:
                    continue  # because get_xpos doesn't support chrMT.

                clinical_significance = line_dict[
                    "clinical_significance"].lower()
                if clinical_significance in ("not provided", "other",
                                             "association"):
                    continue

                xpos = get_xpos(chrom, pos)
                # Prefer the "variation_id" column; fall back to
                # "measureset_id" when it is missing or empty.
                variant_id = (line_dict.get("variation_id")
                              or line_dict.get("measureset_id"))
                variants[(xpos, ref, alt)] = (variant_id,
                                              clinical_significance)

            except Exception as e:
                sys.stderr.write(
                    "Error while parsing clinvar row: \n%s\n %s\n" % (
                        line,
                        e,
                    ))

    _CLINVAR_VARIANTS = variants
    return _CLINVAR_VARIANTS