def setup_clinvar_file():
    """
    Open the latest b37 ClinVar VCF and return the open file object.

    The file is looked up (via clinvar_update.get_latest_vcf_file) in the
    'genome_processing_files' directory under settings.LOCAL_STORAGE_ROOT,
    which is created first if it does not exist.

    '.bz2' and '.gz' paths are opened in binary mode with the matching
    decompressor; any other path is opened with the builtin open() in its
    default mode. The caller is responsible for closing the returned file.
    """
    storage_dir = os.path.join(settings.LOCAL_STORAGE_ROOT,
                               'genome_processing_files')
    if not os.path.exists(storage_dir):
        os.makedirs(storage_dir)

    vcf_path = clinvar_update.get_latest_vcf_file(
        target_dir=storage_dir, build='b37')

    # Pick the opener from the file extension.
    if vcf_path.endswith('.bz2'):
        return bz2.BZ2File(vcf_path, 'rb')
    if vcf_path.endswith('.gz'):
        return gzip.open(vcf_path, 'rb')
    return open(vcf_path)
def setup_clinvar_data():
    """
    Return the ClinVar "sigposlist" data for the latest b37 ClinVar VCF.

    The ClinVar VCF is looked up (via clinvar_update.get_latest_vcf_file)
    in the 'genome_processing_files' directory under
    settings.LOCAL_STORAGE_ROOT, which is created first if missing.

    If a cache file named '<clinvar vcf path>.sigposlist.json.gz' exists,
    its JSON content is loaded; otherwise generate_clinvar_sig() is called
    with the VCF path and the cache path.
    NOTE(review): generate_clinvar_sig presumably also writes the cache
    file -- confirm against its definition.

    Returns:
        set: the loaded or generated items, de-duplicated.
    """
    local_storage = os.path.join(settings.LOCAL_STORAGE_ROOT,
                                 'genome_processing_files')
    if not os.path.exists(local_storage):
        os.makedirs(local_storage)

    clinvar_filepath = clinvar_update.get_latest_vcf_file(
        target_dir=local_storage, build='b37')
    clinvar_sig_filepath = '{}.sigposlist.json.gz'.format(clinvar_filepath)

    if os.path.exists(clinvar_sig_filepath):
        # Context manager so the gzip handle is closed once the JSON has
        # been read, instead of being left open until garbage collection.
        with gzip.open(clinvar_sig_filepath, 'rt') as clinvar_sig_file:
            clinvar_sig = json.load(clinvar_sig_file)
    else:
        clinvar_sig = generate_clinvar_sig(clinvar_filepath,
                                           clinvar_sig_filepath)
    return set(clinvar_sig)
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    Genome variants are matched against the latest b37 ClinVar VCF found in
    FILESDIR. Matches that fail the VCF filters, or that have no ClinVar
    record with significance '4' or '5', are dropped. The remaining
    variants are annotated with myvariant.info 'clinvar' and 'exac' data
    and written one row per variant.

    Row columns: inputfilename, HGVS id, preferred name, disease name(s),
    ClinVar URL, ExAC URL, allele frequency, frequency source,
    GET-Evidence URL. No header row is written.

    Args:
        inputfile: genome VCF input, handed to vcf2clinvar.match_to_clinvar.
        outputfile: path of the CSV file to create or overwrite.
        inputfilename: label written in the first column of every row.

    Raises:
        IOError: if the ClinVar file path does not end with '.vcf',
            '.vcf.gz' or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(
            inputfile, input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue

            # Check significance. Only keep this as a notable variant if
            # one of the submissions has reported a "pathogenic" or
            # "likely pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue

            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(
                genome_vcf_line.chrom,
                genome_vcf_line.start,
                genome_vcf_line.ref_allele,
                allele.sequence)
            data[poskey] = {'genome_vcf_line': genome_vcf_line,
                            'clinvar_allele': allele,
                            'zygosity': zygosity}
    finally:
        # Matching is finished (or failed): release the ClinVar handle.
        input_clinvar_file.close()

    # Materialise the keys: a dict view cannot be indexed on Python 3 and
    # the same ordering is needed for the query and for the report.
    variants = list(data)

    # Add data from myvariant.info using the HGVS positions (mainly for
    # ExAC data). Skip the remote query when nothing matched.
    if variants:
        mv = myvariant.MyVariantInfo()
        mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    else:
        mv_output = []
    # NOTE(review): assumes getvariants returns one result per queried id,
    # in query order (the original index-based pairing relied on this too).
    for var, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[var]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[var]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]
            genome_vcf_line = entry['genome_vcf_line']
            allele = entry['clinvar_allele']

            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                allele.records[0].acc)

            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = ('http://www.ncbi.nlm.nih.gov/clinvar/variation/'
                          '{}/'.format(mv_clinvar['variant_id']))
                try:
                    # Single-record case: 'rcv' is one mapping.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records: merge the distinct
                    # condition names (sorted so the report is stable).
                    disease_name = ', '.join(sorted({
                        rcv['conditions']['name']
                        for rcv in mv_clinvar['rcv']}))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)

            # chrom[3:] drops the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- confirm).
            exac_url = ('http://exac.broadinstitute.org/variant/'
                        '{}-{}-{}-{}'.format(
                            genome_vcf_line.chrom[3:],
                            genome_vcf_line.start,
                            genome_vcf_line.ref_allele,
                            allele.sequence))

            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                mv_exac = entry['mv_exac']
                total_freq = str(
                    mv_exac['ac']['ac'] * 1.0 / mv_exac['an']['an'])
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(allele.frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'

            csv_out.writerow([
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url])
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    Genome variants are matched against the latest b37 ClinVar VCF found in
    FILESDIR. Matches that fail the VCF filters, or that have no ClinVar
    record with significance '4' or '5', are dropped. The remaining
    variants are annotated with myvariant.info 'clinvar' and 'exac' data
    and written one row per variant.

    Row columns: inputfilename, HGVS id, preferred name, disease name(s),
    ClinVar URL, ExAC URL, allele frequency, frequency source,
    GET-Evidence URL. No header row is written.

    Args:
        inputfile: genome VCF input, handed to vcf2clinvar.match_to_clinvar.
        outputfile: path of the CSV file to create or overwrite.
        inputfilename: label written in the first column of every row.

    Raises:
        IOError: if the ClinVar file path does not end with '.vcf',
            '.vcf.gz' or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile,
                                                       input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue

            # Check significance. Only keep this as a notable variant if
            # one of the submissions has reported a "pathogenic" or
            # "likely pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue

            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                           genome_vcf_line.start,
                                           genome_vcf_line.ref_allele,
                                           allele.sequence)
            data[poskey] = {
                'genome_vcf_line': genome_vcf_line,
                'clinvar_allele': allele,
                'zygosity': zygosity
            }
    finally:
        # Matching is finished (or failed): release the ClinVar handle.
        input_clinvar_file.close()

    # Materialise the keys: a dict view cannot be indexed on Python 3 and
    # the same ordering is needed for the query and for the report.
    variants = list(data)

    # Add data from myvariant.info using the HGVS positions (mainly for
    # ExAC data). Skip the remote query when nothing matched.
    if variants:
        mv = myvariant.MyVariantInfo()
        mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    else:
        mv_output = []
    # NOTE(review): assumes getvariants returns one result per queried id,
    # in query order (the original index-based pairing relied on this too).
    for var, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[var]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[var]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]
            genome_vcf_line = entry['genome_vcf_line']
            allele = entry['clinvar_allele']

            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                allele.records[0].acc)

            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = ('http://www.ncbi.nlm.nih.gov/clinvar/variation/'
                          '{}/'.format(mv_clinvar['variant_id']))
                try:
                    # Single-record case: 'rcv' is one mapping.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records: merge the distinct
                    # condition names (sorted so the report is stable).
                    disease_name = ', '.join(sorted({
                        rcv['conditions']['name']
                        for rcv in mv_clinvar['rcv']
                    }))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)

            # chrom[3:] drops the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- confirm).
            exac_url = ('http://exac.broadinstitute.org/variant/'
                        '{}-{}-{}-{}'.format(
                            genome_vcf_line.chrom[3:],
                            genome_vcf_line.start,
                            genome_vcf_line.ref_allele,
                            allele.sequence))

            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                mv_exac = entry['mv_exac']
                total_freq = str(
                    mv_exac['ac']['ac'] * 1.0 / mv_exac['an']['an'])
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(allele.frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'

            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url
            ]
            csv_out.writerow(data_row)
def main():
    """
    Parse command line arguments and output the requested report type
    (csv or JSON).

    The genome VCF is read from -i/--input, or from stdin when stdin is not
    a terminal. The ClinVar VCF comes either from -c/--clinvarfile or from
    the latest file in -C/--clinvardir (exactly one must be given). A
    genome build of 'b37' or 'b38' is mandatory.

    Exits with status 1 (after printing help) when no genome input is
    available or when the ClinVar file/directory options are missing or
    both given.

    Raises:
        IOError: for an unsupported genome or ClinVar file extension, a
            missing or invalid genome build, or an invalid report type.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "-c", "--clinvarfile", dest="clinvarfile",
        help="ClinVar VCF file (either this or -C must be specified)",
        metavar="CLINVARFILE")
    parser.add_argument(
        "-C", "--clinvardir", dest="clinvardir",
        help="ClinVar VCF directory (either this or -c must be specified). " +
        "This option will use vcf2clinvar.clinvar_update to automatically " +
        "check and import the most recent ClinVar file to this directory.",
        metavar="CLINVARDIR")
    parser.add_argument(
        "-i", "--input", dest="inputfile",
        help="Input VCF file ['.vcf', '.vcf.gz', '.vcf.bz2']. " +
        "Uncompressed genome data is also accepted via stdin.",
        metavar="INPUT")
    parser.add_argument(
        "-t", "--type", dest="type", default='csv',
        help="Output report type ('csv' or 'json'). Defaults to csv. " +
        "CSV Report: Reports all genome variants matching ClinVar records, " +
        "and some summary ClinVar data from these records. Header lines " +
        "with metadata begin with '##'.\n" +
        "JSON Report: Reports genome variants matching ClinVar records " +
        "(no record information is included).",
        metavar="TYPE")
    parser.add_argument(
        "-n", "--notes", dest="notes",
        help="Notes (JSON format) to include in report. (JSON report only)",
        metavar="NOTES")
    parser.add_argument(
        "-g", "--genome-build", dest="build",
        help="Genome build to include in report ('b37' or 'b38').",
        metavar="GENOMEBUILD")
    options = parser.parse_args()

    version = "0.1.2a"

    # Genome input: explicit file (opener chosen by extension) or stdin.
    if options.inputfile:
        if options.inputfile.endswith('.vcf'):
            input_genome_file = open(options.inputfile)
        elif options.inputfile.endswith('.vcf.gz'):
            input_genome_file = gzip.open(options.inputfile)
        elif options.inputfile.endswith('.vcf.bz2'):
            input_genome_file = bz2.BZ2File(options.inputfile)
        else:
            raise IOError("Genome filename expected to end with '.vcf'," +
                          " '.vcf.gz', or '.vcf.bz2'.")
    elif not sys.stdin.isatty():
        input_genome_file = sys.stdin
    else:
        sys.stderr.write("Provide input VCF file\n")
        parser.print_help()
        sys.exit(1)

    # A missing build (None) fails this membership test as well.
    if options.build in ('b37', 'b38'):
        build = options.build
    else:
        raise IOError("Input VCF genome build must be 'b37' or 'b38'.")

    # Exactly one of --clinvarfile / --clinvardir must be supplied.
    if (not (options.clinvarfile or options.clinvardir) or
            (options.clinvarfile and options.clinvardir)):
        sys.stderr.write("Please provide either a ClinVar file or directory.\n")
        parser.print_help()
        sys.exit(1)
    if options.clinvarfile:
        clinvarfilename = options.clinvarfile
    else:
        clinvarfilename = get_latest_vcf_file(target_dir=options.clinvardir,
                                              build=build)

    if clinvarfilename.endswith('.vcf'):
        # Open the resolved name: options.clinvarfile is None when the
        # file was located through --clinvardir.
        input_clinvar_file = open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvarfilename)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    if options.type not in ['csv', 'json']:
        raise IOError("Not a valid report type, must be 'csv' or 'json'.")

    if options.type == "csv":
        csv_report(input_genome_file=input_genome_file,
                   input_clinvar_file=input_clinvar_file,
                   build=build,
                   version=version)
    elif options.type == "json":
        notes_json = {}
        if options.notes:
            # Keep the raw string under "parameter" as the fallback in case
            # it is not valid JSON; a successful parse replaces it.
            notes_json["parameter"] = options.notes
            try:
                notes_json = json.loads(options.notes)
            except ValueError:
                sys.stderr.write("Could not parse JSON notes field\n")
        json_report(input_genome_file=input_genome_file,
                    input_clinvar_file=input_clinvar_file,
                    build=build,
                    notes=notes_json,
                    version=version)
def main():
    """
    Parse command line arguments and output the requested report type
    (csv or JSON).

    The genome VCF is read from -i/--input, or from stdin when stdin is not
    a terminal. The ClinVar VCF comes either from -c/--clinvarfile or from
    the latest file in -C/--clinvardir (exactly one must be given). A
    genome build of 'b37' or 'b38' is mandatory.

    Exits with status 1 (after printing help) when no genome input is
    available or when the ClinVar file/directory options are missing or
    both given.

    Raises:
        IOError: for an unsupported genome or ClinVar file extension, a
            missing or invalid genome build, or an invalid report type.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "-c", "--clinvarfile", dest="clinvarfile",
        help="ClinVar VCF file (either this or -C must be specified)",
        metavar="CLINVARFILE")
    parser.add_argument(
        "-C", "--clinvardir", dest="clinvardir",
        help="ClinVar VCF directory (either this or -c must be specified). " +
        "This option will use vcf2clinvar.clinvar_update to automatically " +
        "check and import the most recent ClinVar file to this directory.",
        metavar="CLINVARDIR")
    parser.add_argument(
        "-i", "--input", dest="inputfile",
        help="Input VCF file ['.vcf', '.vcf.gz', '.vcf.bz2']. " +
        "Uncompressed genome data is also accepted via stdin.",
        metavar="INPUT")
    parser.add_argument(
        "-t", "--type", dest="type", default='csv',
        help="Output report type ('csv' or 'json'). Defaults to csv. " +
        "CSV Report: Reports all genome variants matching ClinVar records, " +
        "and some summary ClinVar data from these records. Header lines " +
        "with metadata begin with '##'.\n" +
        "JSON Report: Reports genome variants matching ClinVar records " +
        "(no record information is included).",
        metavar="TYPE")
    parser.add_argument(
        "-n", "--notes", dest="notes",
        help="Notes (JSON format) to include in report. (JSON report only)",
        metavar="NOTES")
    parser.add_argument(
        "-g", "--genome-build", dest="build",
        help="Genome build to include in report ('b37' or 'b38').",
        metavar="GENOMEBUILD")
    options = parser.parse_args()

    # NOTE(review): the version is read by shelling out to setup.py, so it
    # depends on the current working directory containing setup.py and on
    # `python` being on PATH; when the command prints nothing to stdout the
    # version is an empty string.
    version = os.popen("python setup.py --version").read().strip()

    # Genome input: explicit file (opener chosen by extension) or stdin.
    if options.inputfile:
        if options.inputfile.endswith('.vcf'):
            input_genome_file = open(options.inputfile)
        elif options.inputfile.endswith('.vcf.gz'):
            input_genome_file = gzip.open(options.inputfile)
        elif options.inputfile.endswith('.vcf.bz2'):
            input_genome_file = bz2.BZ2File(options.inputfile)
        else:
            raise IOError("Genome filename expected to end with '.vcf'," +
                          " '.vcf.gz', or '.vcf.bz2'.")
    elif not sys.stdin.isatty():
        input_genome_file = sys.stdin
    else:
        sys.stderr.write("Provide input VCF file\n")
        parser.print_help()
        sys.exit(1)

    # A missing build (None) fails this membership test as well.
    if options.build in ('b37', 'b38'):
        build = options.build
    else:
        raise IOError("Input VCF genome build must be 'b37' or 'b38'.")

    # Exactly one of --clinvarfile / --clinvardir must be supplied.
    if (not (options.clinvarfile or options.clinvardir) or
            (options.clinvarfile and options.clinvardir)):
        sys.stderr.write("Please provide either a ClinVar file or directory.\n")
        parser.print_help()
        sys.exit(1)
    if options.clinvarfile:
        clinvarfilename = options.clinvarfile
    else:
        clinvarfilename = get_latest_vcf_file(target_dir=options.clinvardir,
                                              build=build)

    if clinvarfilename.endswith('.vcf'):
        # Open the resolved name: options.clinvarfile is None when the
        # file was located through --clinvardir.
        input_clinvar_file = open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvarfilename)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    if options.type not in ['csv', 'json']:
        raise IOError("Not a valid report type, must be 'csv' or 'json'.")

    if options.type == "csv":
        csv_report(input_genome_file=input_genome_file,
                   input_clinvar_file=input_clinvar_file,
                   build=build,
                   version=version)
    elif options.type == "json":
        notes_json = {}
        if options.notes:
            # Keep the raw string under "parameter" as the fallback in case
            # it is not valid JSON; a successful parse replaces it.
            notes_json["parameter"] = options.notes
            try:
                notes_json = json.loads(options.notes)
            except ValueError:
                sys.stderr.write("Could not parse JSON notes field\n")
        json_report(input_genome_file=input_genome_file,
                    input_clinvar_file=input_clinvar_file,
                    build=build,
                    notes=notes_json,
                    version=version)