def export_to_es(rows, input_dataset, project_guid, es_host, es_port, block_size, num_shards):
    meta = {
        'genomeVersion': '38',
        'sampleType': WGS_SAMPLE_TYPE,
        'datasetType': 'SV',
        'sourceFilePath': input_dataset,
    }
    index_name = get_es_index_name(project_guid, meta)
    rows = rows.annotate_globals(**meta)

    es_password = os.environ.get('PIPELINE_ES_PASSWORD', '')
    es_client = ElasticsearchClient(host=es_host, port=es_port, es_password=es_password)
    es_client.export_table_to_elasticsearch(
        rows,
        index_name=index_name,
        block_size=block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
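# A minimal usage sketch, not from the source: the bucket paths, project GUID,
# host, and sizing values below are hypothetical placeholders. The ES password
# is read from PIPELINE_ES_PASSWORD by export_to_es itself.
import hail as hl

rows = hl.read_table('gs://my-bucket/sv_callset.rows.ht')  # hypothetical path
export_to_es(
    rows,
    input_dataset='gs://my-bucket/sv_callset.vcf.gz',  # recorded as sourceFilePath in index _meta
    project_guid='R0001_my_project',
    es_host='localhost',
    es_port=9200,
    block_size=2000,
    num_shards=1,
)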
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    if self.es_index != self.es_index.lower():
        raise Exception(
            f"Invalid es_index name [{self.es_index}], must be lowercase")
    self._es = ElasticsearchClient(host=self.es_host, port=self.es_port,
                                   es_username=self.es_username,
                                   es_password=self.es_password)
def export_table_to_elasticsearch(table_url, host, index_name, index_type,
                                  port=9200, num_shards=1, block_size=200):
    ds = hl.read_table(table_url)
    es = ElasticsearchClient(host, port)
    es.export_table_to_elasticsearch(
        ds,
        index_name=index_name,
        index_type_name=index_type,
        block_size=block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
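# A usage sketch for the wrapper above; the table URL, host, and index names
# are hypothetical and only illustrate the expected argument shapes.
export_table_to_elasticsearch(
    table_url='gs://my-bucket/annotated_variants.ht',  # hypothetical path
    host='localhost',
    index_name='my_project__grch38__variants',
    index_type='variant',
)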
def update_all_datasets(hc, args):
    client = ElasticsearchClient(host=args.host, port=args.port)
    indices = client.es.cat.indices(h="index", s="index").strip().split("\n")
    for i, index_name in enumerate(indices):
        _meta = client.get_index_meta(index_name)
        logger.info("==> updating index {} out of {}: {}".format(
            i + 1, len(indices), index_name))
        if _meta and "sourceFilePath" in _meta:
            try:
                update_dataset(index_name, args)
            except Exception as e:
                logger.error("ERROR while updating %s - %s: %s", index_name,
                             _meta["sourceFilePath"], e)
        else:
            logger.info(
                "==> skipping {} because index _meta['sourceFilePath'] isn't set: {}"
                .format(index_name, _meta))
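# A sketch of the intermediate value above, assuming two indices exist: the
# Elasticsearch cat API with h="index", s="index" returns plain text with one
# index name per line, which is why the result is stripped and split on newlines.
raw = "my_project__grch38__variants\nother_project__grch37__variants\n"
indices = raw.strip().split("\n")
assert indices == ["my_project__grch38__variants",
                   "other_project__grch37__variants"]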
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._es = ElasticsearchClient(host=self.es_host, port=self.es_port)
class HailElasticSearchTask(luigi.Task):
    """
    Loads a MT to ES (TODO).
    """
    source_path = luigi.OptionalParameter(default=None)
    use_temp_loading_nodes = luigi.BoolParameter(
        default=True, description='Whether to use temporary loading nodes.')
    es_host = luigi.Parameter(description='ElasticSearch host.', default='localhost')
    es_port = luigi.IntParameter(description='ElasticSearch port.', default=9200)
    es_index = luigi.Parameter(description='ElasticSearch index.', default='data')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._es = ElasticsearchClient(host=self.es_host, port=self.es_port)

    def requires(self):
        return [VcfFile(filename=self.source_path)]

    def run(self):
        mt = self.import_mt()
        # TODO: Load into ES

    def import_mt(self):
        return hl.read_matrix_table(self.input()[0].path)

    def export_table_to_elasticsearch(self, table):
        func_to_run_after_index_exists = None if not self.use_temp_loading_nodes else \
            lambda: self.route_index_to_temp_es_cluster(True)
        self._es.export_table_to_elasticsearch(
            table,
            index_name=self.es_index,
            func_to_run_after_index_exists=func_to_run_after_index_exists,
            elasticsearch_mapping_id="docId",
            write_null_values=True)

    def cleanup(self):
        self.route_index_to_temp_es_cluster(False)

    def route_index_to_temp_es_cluster(self, to_temp_loading):
        """Apply shard allocation filtering rules for the given index to
        elasticsearch data nodes with *loading* in their name:

        If to_temp_loading is True, route new documents in the given index only
        to nodes named "*loading*". Otherwise, move any shards in this index off
        of nodes named "*loading*".

        Args:
            to_temp_loading (bool): whether to route shards in the given index
                to the "*loading*" nodes, or move shards off of these nodes.
        """
        if to_temp_loading:
            require_name = "es-data-loading*"
            exclude_name = ""
        else:
            require_name = ""
            exclude_name = "es-data-loading*"

        body = {
            "index.routing.allocation.require._name": require_name,
            "index.routing.allocation.exclude._name": exclude_name
        }

        logger.info("==> Setting {}* settings = {}".format(self.es_index, body))
        index_arg = "{}*".format(self.es_index)
        self._es.es.indices.put_settings(index=index_arg, body=body)
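# A sketch of the shard-allocation settings route_index_to_temp_es_cluster
# applies, shown with a plain elasticsearch-py client; the host and index
# pattern are hypothetical, and the setting values are taken from the task above.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # hypothetical host

# Pin new shards for matching indices to nodes named "es-data-loading*":
es.indices.put_settings(index="data*", body={
    "index.routing.allocation.require._name": "es-data-loading*",
    "index.routing.allocation.exclude._name": "",
})
# Swapping the two values makes Elasticsearch migrate the shards back off the
# loading nodes onto the permanent data nodes, which is what cleanup() triggers.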
class HailElasticSearchTask(luigi.Task):
    """
    Loads a MT to ES (TODO).
    """
    source_path = luigi.OptionalParameter(default=None)
    use_temp_loading_nodes = luigi.BoolParameter(
        default=True, description='Whether to use temporary loading nodes.')
    es_host = luigi.Parameter(description='ElasticSearch host.', default='localhost')
    es_port = luigi.IntParameter(description='ElasticSearch port.', default=9200)
    es_index = luigi.Parameter(description='ElasticSearch index.', default='data')
    es_username = luigi.Parameter(description='ElasticSearch username.', default='pipeline')
    es_password = luigi.Parameter(description='ElasticSearch password.',
                                  visibility=ParameterVisibility.PRIVATE,
                                  default=None)
    es_index_min_num_shards = luigi.IntParameter(
        default=1,
        description='Number of shards for the index will be the greater of '
                    'this value and a calculated value based on the matrix.')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.es_index != self.es_index.lower():
            raise Exception(
                f"Invalid es_index name [{self.es_index}], must be lowercase")
        self._es = ElasticsearchClient(host=self.es_host, port=self.es_port,
                                       es_username=self.es_username,
                                       es_password=self.es_password)

    def requires(self):
        return [VcfFile(filename=self.source_path)]

    def run(self):
        mt = self.import_mt()
        # TODO: Load into ES

    def import_mt(self):
        return hl.read_matrix_table(self.input()[0].path)

    def export_table_to_elasticsearch(self, table, num_shards):
        func_to_run_after_index_exists = None if not self.use_temp_loading_nodes else \
            lambda: self._es.route_index_to_temp_es_cluster(self.es_index)
        self._es.export_table_to_elasticsearch(
            table,
            index_name=self.es_index,
            func_to_run_after_index_exists=func_to_run_after_index_exists,
            elasticsearch_mapping_id="docId",
            num_shards=num_shards,
            write_null_values=True)

    def cleanup(self):
        self._es.route_index_off_temp_es_cluster(self.es_index)

    def _mt_num_shards(self, mt):
        # Use the greater of the user-specified minimum and a value calculated
        # from the number of variants and samples in the matrix.
        denominator = 1.4 * 10**9
        calculated_num_shards = math.ceil(
            (mt.count_rows() * mt.count_cols()) / denominator)
        return max(self.es_index_min_num_shards, calculated_num_shards)
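# A worked example of the _mt_num_shards arithmetic above, using hypothetical
# callset dimensions: 20M variants x 150 samples = 3.0e9 entries, and
# 3.0e9 / 1.4e9 rounds up to 3 shards, which exceeds the default minimum of 1.
import math

num_rows, num_cols = 20_000_000, 150  # hypothetical matrix dimensions
calculated = math.ceil((num_rows * num_cols) / (1.4 * 10**9))
assert calculated == 3
assert max(1, calculated) == 3  # es_index_min_num_shards defaults to 1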
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_id_to_consequence_json=get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus),
    )

    print("\n=== Summary ===")
    hl.summarize_variants(mt)

    # Drop key columns for export
    rows = mt.rows()
    rows = rows.order_by(rows.variant_id).drop("locus", "alleles")

    print("\n=== Exporting to Elasticsearch ===")
    es = ElasticsearchClient(args.host, args.port)
    es.export_table_to_elasticsearch(
        rows,
        index_name=index_name,
        index_type_name=args.index_type,
        block_size=args.es_block_size,
        num_shards=args.num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
def update_dataset(index_name, args):
    elasticsearch_client = ElasticsearchClient(host=args.host, port=args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)

    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        raise ValueError(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index."
            .format(index_name))

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    genome_version = args.genome_version or _meta.get("genomeVersion")

    if genome_version is None:
        match = re.search("__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            raise ValueError(
                "ERROR: couldn't update reference data in {} because the genome version wasn't found in "
                "_meta ({}) or in the index name.".format(index_name, _meta))
        genome_version = match.group(1)

    print('dataset_path: %s' % dataset_path)

    # Import the VCFs from inputs. Set min partitions so that local pipeline
    # execution takes advantage of all CPUs.
    mt = hl.import_vcf(dataset_path,
                       reference_genome='GRCh' + genome_version,
                       force_bgz=True,
                       min_partitions=500)

    # If the input VCF is malformed (e.g. it contains duplicated loci),
    # de-duplicate the rows before splitting multi-allelics.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by('locus', 'alleles')
    mt = hl.split_multi_hts(
        mt.annotate_rows(locus_old=mt.locus, alleles_old=mt.alleles))

    if args.update_clinvar:
        clinvar = hl.read_table(clinvar_ht_path)
        mt = CLINVARSchema(mt, clinvar_data=clinvar).clinvar()
    if args.update_hgmd:
        hgmd = hl.read_table(hgmd_ht_path)
        mt = HGMDSchema(mt, hgmd_data=hgmd).hgmd()
    if args.update_cidr:
        cidr = hl.read_table(cidr_ht_path)
        mt = CIDRSchema(mt, cidr_data=cidr).cidr()

    mt = mt.aIndex().doc_id()
    mt = mt.select_annotated_mt()

    variant_count = mt.count_rows()
    logger.info(
        "\n==> exporting {} variants to elasticsearch:".format(variant_count))

    row_table = mt.rows().flatten()
    row_table = row_table.drop(row_table.locus, row_table.alleles)

    elasticsearch_client.export_table_to_elasticsearch(
        row_table,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )
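# A small illustration of the genome-version fallback above: when _meta lacks
# genomeVersion, the version is parsed out of the index name. The index name
# here is hypothetical; the regex is the one used in update_dataset.
import re

match = re.search("__grch([0-9]+)__", "my_project__grch38__variants", re.IGNORECASE)
assert match and match.group(1) == "38"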