def export_ht_to_es(ht, host='172.23.117.23', port=9200, index_name='pcgc_chr20_test',
                    index_type='variant', es_block_size=200, num_shards=1):
    es = ElasticsearchClient(host, port)
    es.export_table_to_elasticsearch(
        ht,
        index_name=index_name,
        index_type_name=index_type,
        block_size=es_block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
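# Example usage (illustrative only; the table path, host, and index name below
# are placeholders, not values from this repo):
#
#   import hail as hl
#   ht = hl.read_table('gs://my-bucket/my_table.ht')
#   export_ht_to_es(ht, host='localhost', index_name='my_index')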
path_rare_variants = 'gs://schizophrenia-browser/171211/2017-12-11_release-v1-browser-variant-in-schema.kt'

kt_pop = hc.read_table(path_pop)
kt_annotations = hc.read_table(path_annotations)
kt_rare_variants = hc.read_table(path_rare_variants)

pprint(kt_pop.schema)
pprint(kt_annotations.schema)
pprint(kt_rare_variants.schema)

ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'chrom = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    'xpos = %s' % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]
for expression in annotation_expressions:
    kt_rare_variants = kt_rare_variants.annotate(expression)
kt_rare_variants = kt_rare_variants.drop(['v'])

kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v'])
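# For reference: get_expr_for_xpos encodes (chrom, pos) as a single sortable
# integer. A minimal Python sketch of that encoding, assuming the seqr
# convention of chrom_number * 1e9 + pos (with X=23, Y=24, M=25):
def xpos_sketch(chrom_number, pos):
    return chrom_number * int(1e9) + pos

assert xpos_sketch(20, 61098) == 20000061098  # chr20:61098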
kt_variants = kt_variant_results.key_by('v').join(kt_variant_annotation.key_by('v'))
kt_variants = kt_variants.drop(['v'])

kt_results_by_cohort = kt_results_by_cohort.rename(column_map)
for expression in annotation_expressions:
    kt_results_by_cohort = kt_results_by_cohort.annotate(expression)
kt_results_by_cohort = kt_results_by_cohort.drop(['v'])

ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)
es.export_kt_to_elasticsearch(
    kt_variants,
    index_name='schizophrenia_exome_variants_results_180512',
    index_type_name='schizophrenia_exome_variant',
    block_size=1000,
    num_shards=2,
    delete_index_before_exporting=True,
    verbose=True,
)
es.export_kt_to_elasticsearch(
    kt_results_by_cohort,
    index_name='schizophrenia_exome_variants_groups_180512',
    # ... (remaining arguments elided in the source; presumably they mirror the call above)
)
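# Recap of the KeyTable idioms used above (Hail 0.1 API): key_by('v') keys a
# table by the variant column and join() is an inner join on that key, while
# rename() takes a dict of {old_name: new_name}. Minimal pattern:
#
#   kt_joined = kt_left.key_by('v').join(kt_right.key_by('v'))
#   kt_joined = kt_joined.rename({'old_column': 'new_column'})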
def export_to_elasticsearch(
    host,
    port,
    vds,
    index_name,
    args,
    operation=ELASTICSEARCH_INDEX,
    delete_index_before_exporting=False,
    export_genotypes=True,
    disable_doc_values_for_fields=(),
    disable_index_for_fields=(),
    export_snapshot_to_google_bucket=False,
    start_with_sample_group=0,
):
    """Utility method for exporting the given vds to an elasticsearch index."""

    logger.info("Input: " + args.input_vds)

    index_type = "variant"

    if export_genotypes:
        if args.analysis_type == "GATK_VARIANTS":
            genotype_fields_to_export = DEFAULT_GENOTYPE_FIELDS_TO_EXPORT
            genotype_field_to_elasticsearch_type_map = DEFAULT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
        elif args.analysis_type in ["MANTA_SVS", "JULIA_SVS"]:
            genotype_fields_to_export = [
                'num_alt = if(g.GT.isCalled()) g.GT.nNonRefAlleles() else -1',
                #'genotype_filter = g.FT',
                #'gq = g.GQ',
                'dp = if(g.GT.isCalled()) [g.PR.sum + g.SR.sum, ' + ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE + '].min() else NA:Int',
                'ab = let total=g.PR.sum + g.SR.sum in if(g.GT.isCalled() && total != 0) ((g.PR[1] + g.SR[1]) / total).toFloat else NA:Float',
                'ab_PR = let total=g.PR.sum in if(g.GT.isCalled() && total != 0) (g.PR[1] / total).toFloat else NA:Float',
                'ab_SR = let total=g.SR.sum in if(g.GT.isCalled() && total != 0) (g.SR[1] / total).toFloat else NA:Float',
                'dp_PR = if(g.GT.isCalled()) [g.PR.sum, ' + ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE + '].min() else NA:Int',
                'dp_SR = if(g.GT.isCalled()) [g.SR.sum, ' + ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE + '].min() else NA:Int',
            ]
            genotype_field_to_elasticsearch_type_map = {
                ".*_num_alt": {"type": "byte", "doc_values": "false"},
                #".*_genotype_filter": {"type": "keyword", "doc_values": "false"},
                #".*_gq": {"type": "short", "doc_values": "false"},
                ".*_dp": {"type": "short", "doc_values": "false"},
                ".*_ab": {"type": "half_float", "doc_values": "false"},
                ".*_ab_PR": {"type": "half_float", "doc_values": "false"},
                ".*_ab_SR": {"type": "half_float", "doc_values": "false"},
                ".*_dp_PR": {"type": "short", "doc_values": "false"},
                ".*_dp_SR": {"type": "short", "doc_values": "false"},
            }
        else:
            raise ValueError("Unexpected args.analysis_type: %s" % args.analysis_type)
    else:
        genotype_fields_to_export = []
        genotype_field_to_elasticsearch_type_map = {}

    vds = vds.persist()

    client = ElasticsearchClient(host, port)

    # sample_groups is expected to be defined at module level by the calling script
    for i, sample_group in enumerate(sample_groups):
        if i < start_with_sample_group:
            continue

        #if delete_index_before_exporting and i < 4:
        #    continue

        if len(sample_groups) > 1:
            vds_sample_subset = vds.filter_samples_list(sample_group, keep=True)
            current_index_name = "%s_%s" % (index_name, i)
        else:
            vds_sample_subset = vds
            current_index_name = index_name

        logger.info("==> exporting %s samples into %s" % (len(sample_group), current_index_name))
        logger.info("Samples: %s .. %s" % (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))
        logger.info("==> export to elasticsearch")
        pprint(vds.variant_schema)

        timestamp1 = time.time()
        client.export_vds_to_elasticsearch(
            vds_sample_subset,
            genotype_fields_to_export=genotype_fields_to_export,
            genotype_field_to_elasticsearch_type_map=genotype_field_to_elasticsearch_type_map,
            index_name=current_index_name,
            index_type_name=index_type,
            block_size=args.block_size,
            num_shards=args.num_shards,
            delete_index_before_exporting=delete_index_before_exporting,
            elasticsearch_write_operation=operation,
            elasticsearch_mapping_id="docId",
            disable_doc_values_for_fields=disable_doc_values_for_fields,
            disable_index_for_fields=disable_index_for_fields,
            is_split_vds=True,
            verbose=True,
        )
        timestamp2 = time.time()
        logger.info("==> finished exporting - time: %s seconds" % (timestamp2 - timestamp1))

    if export_snapshot_to_google_bucket:
        logger.info("==> export snapshot to google bucket")
        client.create_elasticsearch_snapshot(
            index_name=index_name + "*",
            bucket="seqr-database-backups",
            base_path="elasticsearch/snapshots",
            snapshot_repo="callsets",
        )

    client.save_index_operation_metadata(
        args.input_vds,
        index_name,
        args.genome_version,
        fam_file=args.fam_file,
        remap_sample_ids=args.remap_sample_ids,
        subset_samples=args.subset_samples,
        skip_vep=args.skip_vep,
        project_id=args.project_id,
        analysis_type=args.analysis_type,
        sample_type=args.sample_type,
        command=" ".join(sys.argv),
        directory=args.directory,
        username=args.username,
        operation="create_index",
        status="success",
    )
args = p.parse_args()

es = elasticsearch.Elasticsearch(args.host, port=args.port)

existing_indices = es.indices.get(index="*").keys()
if args.index not in existing_indices:
    p.error("%s not found. Existing indices are: %s" % (args.index, existing_indices))

# see https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-snapshots.html
snapshot_name = "snapshot_%s__%s" % (
    args.index.lower(), time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()))

# see https://www.elastic.co/guide/en/elasticsearch/plugins/current/repository-gcs-repository.html
print("==> Check if snapshot repo exists: %s" % args.repo)
repo_info = es.snapshot.get_repository(repository=args.repo)
pprint(repo_info)

print("==> Creating snapshot in gs://%s/%s for index %s" % (args.bucket, args.base_path, args.index))
client = ElasticsearchClient(args.host, args.port)
client.create_elasticsearch_snapshot(
    index_name=args.index + "*",
    bucket=args.bucket,
    base_path=args.base_path,
    snapshot_repo=args.repo)

print("==> Getting snapshot status for: " + snapshot_name)
pprint(es.snapshot.status(repository=args.repo))
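# Optional follow-up (not in the original script): poll until the snapshot
# finishes, using the standard elasticsearch-py snapshot-status API.
while True:
    snapshot_states = [s["state"] for s in es.snapshot.status(repository=args.repo)["snapshots"]]
    if all(state == "SUCCESS" for state in snapshot_states):
        print("==> Snapshot complete")
        break
    print("==> Snapshot in progress: %s" % snapshot_states)
    time.sleep(10)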
COVERAGE_PATHS = EXOME_COVERAGE_CSV_PATHS[-1]

kt_coverage = hc.import_table(COVERAGE_PATHS, types=types)
kt_coverage = kt_coverage.rename({
    '#chrom': 'chrom',
    '1': 'over1',
    '5': 'over5',
    '10': 'over10',
    '15': 'over15',
    '20': 'over20',
    '25': 'over25',
    '30': 'over30',
    '50': 'over50',
    '100': 'over100',
})
print(kt_coverage.schema)

print("======== Export exome coverage to elasticsearch ======")
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)
es.export_kt_to_elasticsearch(
    kt_coverage,
    index_name=args.index,
    index_type_name=args.index_type,
    num_shards=args.num_shards,
    block_size=args.block_size,
    delete_index_before_exporting=True,
    verbose=True)
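# For reference, `types` above is defined earlier in the script; a plausible
# sketch for a gnomAD-style coverage TSV (exact columns/types are assumptions):
#
#   types = {
#       '#chrom': TString(), 'pos': TInt(), 'mean': TDouble(), 'median': TDouble(),
#       '1': TDouble(), '5': TDouble(), '10': TDouble(), '15': TDouble(),
#       '20': TDouble(), '25': TDouble(), '30': TDouble(), '50': TDouble(), '100': TDouble(),
#   }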
"category", ] for field_name in transcript_annotations_to_keep: new_field_name = field_name.split("_")[0] + "".join(map(lambda word: word.capitalize(), field_name.split("_")[1:])) kt = kt.annotate("%(new_field_name)s = mainTranscript.%(field_name)s" % locals()) kt = kt.drop(["mainTranscript"]) # pprint(kt.schema) DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", ) print("======== Export to elasticsearch ======") es = ElasticsearchClient( host=args.host, port=args.port, ) es.export_kt_to_elasticsearch( kt, index_name=args.index, index_type_name=args.index_type, block_size=args.block_size, num_shards=args.num_shards, delete_index_before_exporting=True, disable_doc_values_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS, disable_index_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS, verbose=True, )
import os

# this used to be `import pip; pip.main(['install', 'elasticsearch'])`, but
# pip.main is deprecated as of pip v10
os.system("pip install elasticsearch")

import argparse

from utils.elasticsearch_client import ElasticsearchClient

p = argparse.ArgumentParser()
p.add_argument(
    "-H", "--host",
    help="elasticsearch client host. The default address works if "
         "`kubectl proxy` is running in the background.",
    default="http://localhost:8001/api/v1/namespaces/default/services/elasticsearch:9200/proxy")
p.add_argument("-p", "--port", help="elasticsearch client port.", default="30001")
args = p.parse_args()

# to get the ip address, run `kubectl describe pod elasticsearch-1019229749-vhghc`
ELASTICSEARCH_HOST = args.host
ELASTICSEARCH_PORT = args.port

es = ElasticsearchClient(ELASTICSEARCH_HOST, port=ELASTICSEARCH_PORT)
es.print_elasticsearch_stats()
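# Example invocations (the script filename below is hypothetical):
#
#   kubectl proxy &    # makes the default --host value reachable
#   python print_elasticsearch_stats.py
#   python print_elasticsearch_stats.py -H 10.4.0.13 -p 9200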
else:
    vds_sample_subset = vds
    index_name = args.index_name

logger.info("==> loading %s samples into %s" % (len(sample_group), index_name))
logger.info("Samples: %s .. %s" % (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))
logger.info("==> export to elasticsearch")

DISABLE_INDEX_FOR_FIELDS = ("sortedTranscriptConsequences", )
DISABLE_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", )

timestamp1 = time.time()
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)
# host/port were also passed to export_vds_to_elasticsearch in the source; they
# are dropped here since the client constructed above already holds them
es.export_vds_to_elasticsearch(
    vds_sample_subset,
    genotype_fields_to_export=DEFAULT_GENOTYPE_FIELDS_TO_EXPORT,
    genotype_field_to_elasticsearch_type_map=DEFAULT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP,
    index_name=index_name,
    index_type_name=args.index_type,
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    disable_doc_values_for_fields=DISABLE_DOC_VALUES_FOR_FIELDS,
    disable_index_for_fields=DISABLE_INDEX_FOR_FIELDS,
    # ... (remaining arguments elided in the source)
)
if args.add_all_annotations or args.add_gnomad_coverage:
    logger.info("\n==> Add gnomad coverage")
    vds = vds.persist()
    vds = add_gnomad_exome_coverage_to_vds(hc, vds, args.genome_version, root="va.gnomad_exome_coverage")
    vds = add_gnomad_genome_coverage_to_vds(hc, vds, args.genome_version, root="va.gnomad_genome_coverage")

vds = vds.persist()  # write to VDS

client = ElasticsearchClient(args.host, args.port)
timestamp1 = time.time()
if args.dry_run:
    logger.info("Dry run finished. Next step would be to export to index: " + str(index_name))
else:
    client.export_vds_to_elasticsearch(
        vds,
        genotype_fields_to_export=genotype_fields_to_export,
        genotype_field_to_elasticsearch_type_map=genotype_field_to_elasticsearch_type_map,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        num_shards=args.num_shards,
        # ... (remaining arguments elided in the source)
    )