Example #1
def export_to_es(rows, input_dataset, project_guid, es_host, es_port,
                 block_size, num_shards):
    meta = {
        'genomeVersion': '38',
        'sampleType': WGS_SAMPLE_TYPE,
        'datasetType': 'SV',
        'sourceFilePath': input_dataset,
    }

    index_name = get_es_index_name(project_guid, meta)

    rows = rows.annotate_globals(**meta)

    es_password = os.environ.get('PIPELINE_ES_PASSWORD', '')
    es_client = ElasticsearchClient(host=es_host,
                                    port=es_port,
                                    es_password=es_password)

    es_client.export_table_to_elasticsearch(
        rows,
        index_name=index_name,
        block_size=block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
Example #2
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.es_index != self.es_index.lower():
            raise Exception(
                f"Invalid es_index name [{self.es_index}], must be lowercase")

        self._es = ElasticsearchClient(host=self.es_host,
                                       port=self.es_port,
                                       es_username=self.es_username,
                                       es_password=self.es_password)
Example #3
def export_table_to_elasticsearch(table_url,
                                  host,
                                  index_name,
                                  index_type,
                                  port=9200,
                                  num_shards=1,
                                  block_size=200):
    ds = hl.read_table(table_url)

    es = ElasticsearchClient(host, port)
    es.export_table_to_elasticsearch(
        ds,
        index_name=index_name,
        index_type_name=index_type,
        block_size=block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
Example #4
def update_all_datasets(hc, args):
    client = ElasticsearchClient(host=args.host, port=args.port)
    indices = client.es.cat.indices(h="index", s="index").strip().split("\n")
    for i, index_name in enumerate(indices):
        _meta = client.get_index_meta(index_name)

        logger.info("==> updating index {} out of {}: {}".format(
            i + 1, len(indices), index_name))
        if _meta and "sourceFilePath" in _meta:
            logger.info("==> index _meta['sourceFilePath'] == {}".format(
                _meta["sourceFilePath"]))
            try:
                update_dataset(index_name, args)
            except Exception as e:
                logger.error("ERROR while updating %s - %s: %s", index_name,
                             _meta["sourceFilePath"], e)
        else:
            logger.info(
                "==> skipping {} because index _meta['sourceFilePath'] isn't set: {}"
                .format(index_name, _meta))
Example #5
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._es = ElasticsearchClient(host=self.es_host, port=self.es_port)
Example #6
class HailElasticSearchTask(luigi.Task):
    """
    Loads a MT to ES (TODO).
    """
    source_path = luigi.OptionalParameter(default=None)
    use_temp_loading_nodes = luigi.BoolParameter(
        default=True, description='Whether to use temporary loading nodes.')
    es_host = luigi.Parameter(description='ElasticSearch host.',
                              default='localhost')
    es_port = luigi.IntParameter(description='ElasticSearch port.',
                                 default=9200)
    es_index = luigi.Parameter(description='ElasticSearch index.',
                               default='data')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._es = ElasticsearchClient(host=self.es_host, port=self.es_port)

    def requires(self):
        return [VcfFile(filename=self.source_path)]

    def run(self):
        mt = self.import_mt()
        # TODO: Load into ES

    def import_mt(self):
        return hl.read_matrix_table(self.input()[0].path)

    def export_table_to_elasticsearch(self, table):
        func_to_run_after_index_exists = None if not self.use_temp_loading_nodes else \
            lambda: self.route_index_to_temp_es_cluster(True)
        self._es.export_table_to_elasticsearch(
            table,
            index_name=self.es_index,
            func_to_run_after_index_exists=func_to_run_after_index_exists,
            elasticsearch_mapping_id="docId",
            write_null_values=True)

    def cleanup(self):
        self.route_index_to_temp_es_cluster(False)

    def route_index_to_temp_es_cluster(self, to_temp_loading):
        """Apply shard allocation filtering rules for the given index to elasticsearch data nodes with *loading* in
        their name:

        If to_temp_loading is True, route new documents in the given index only to nodes named "*loading*".
        Otherwise, move any shards in this index off of nodes named "*loading*"

        Args:
            to_temp_loading (bool): whether to route shards in the given index to the "*loading*" nodes, or move
            shards off of these nodes.
        """
        if to_temp_loading:
            require_name = "es-data-loading*"
            exclude_name = ""
        else:
            require_name = ""
            exclude_name = "es-data-loading*"

        body = {
            "index.routing.allocation.require._name": require_name,
            "index.routing.allocation.exclude._name": exclude_name
        }

        logger.info("==> Setting {}* settings = {}".format(
            self.es_index, body))

        index_arg = "{}*".format(self.es_index)
        self._es.es.indices.put_settings(index=index_arg, body=body)
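For context, a minimal sketch of how a task like this might be driven by hand, outside the Luigi scheduler. The index name, the table path, and the assumption that the table already carries a docId field are all hypothetical; the real pipeline is expected to build the table inside run().

# Minimal sketch, not from the source: drive the export and the cleanup manually.
import hail as hl

task = HailElasticSearchTask(es_host='localhost',
                             es_port=9200,
                             es_index='my_callset')        # hypothetical index name

# Hypothetical pre-annotated table, assumed to already contain a docId field.
rows = hl.read_table('gs://my-bucket/annotated_rows.ht')

task.export_table_to_elasticsearch(rows)   # routes the new index to the "*loading*" nodes first
task.cleanup()                             # then moves its shards off of the loading nodes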
Example #7
class HailElasticSearchTask(luigi.Task):
    """
    Loads a MT to ES (TODO).
    """
    source_path = luigi.OptionalParameter(default=None)
    use_temp_loading_nodes = luigi.BoolParameter(
        default=True, description='Whether to use temporary loading nodes.')
    es_host = luigi.Parameter(description='ElasticSearch host.',
                              default='localhost')
    es_port = luigi.IntParameter(description='ElasticSearch port.',
                                 default=9200)
    es_index = luigi.Parameter(description='ElasticSearch index.',
                               default='data')
    es_username = luigi.Parameter(description='ElasticSearch username.',
                                  default='pipeline')
    es_password = luigi.Parameter(description='ElasticSearch password.',
                                  visibility=ParameterVisibility.PRIVATE,
                                  default=None)
    es_index_min_num_shards = luigi.IntParameter(
        default=1,
        description='Number of shards for the index will be the greater of '
        'this value and a calculated value based on the matrix.')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.es_index != self.es_index.lower():
            raise Exception(
                f"Invalid es_index name [{self.es_index}], must be lowercase")

        self._es = ElasticsearchClient(host=self.es_host,
                                       port=self.es_port,
                                       es_username=self.es_username,
                                       es_password=self.es_password)

    def requires(self):
        return [VcfFile(filename=self.source_path)]

    def run(self):
        mt = self.import_mt()
        # TODO: Load into ES

    def import_mt(self):
        return hl.read_matrix_table(self.input()[0].path)

    def export_table_to_elasticsearch(self, table, num_shards):
        func_to_run_after_index_exists = None if not self.use_temp_loading_nodes else \
            lambda: self._es.route_index_to_temp_es_cluster(self.es_index)
        self._es.export_table_to_elasticsearch(
            table,
            index_name=self.es_index,
            func_to_run_after_index_exists=func_to_run_after_index_exists,
            elasticsearch_mapping_id="docId",
            num_shards=num_shards,
            write_null_values=True)

    def cleanup(self):
        self._es.route_index_off_temp_es_cluster(self.es_index)

    def _mt_num_shards(self, mt):
        # Use the greater of the user-specified minimum and a value calculated from the number of variants and samples
        denominator = 1.4 * 10**9
        calculated_num_shards = math.ceil(
            (mt.count_rows() * mt.count_cols()) / denominator)
        return max(self.es_index_min_num_shards, calculated_num_shards)
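As a rough worked example of the shard heuristic above (the numbers are illustrative, not from the source): 5 million variants by 500 samples is 2.5e9 entries, and the calculated value beats the default minimum of 1.

import math

# Illustrative numbers only: 5M variants x 500 samples.
denominator = 1.4 * 10**9
calculated_num_shards = math.ceil((5_000_000 * 500) / denominator)  # ceil(2.5e9 / 1.4e9) == 2
num_shards = max(1, calculated_num_shards)                           # es_index_min_num_shards defaults to 1
print(num_shards)                                                     # -> 2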
Example #8
    transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    transcript_ids=get_expr_for_vep_transcript_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    transcript_id_to_consequence_json=
    get_expr_for_vep_transcript_id_to_consequence_map(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    variant_id=get_expr_for_variant_id(mt),
    xpos=get_expr_for_xpos(mt.locus),
)

print("\n=== Summary ===")
hl.summarize_variants(mt)

# Sort rows by variant_id and drop the key fields (locus, alleles) before export
rows = mt.rows()
rows = rows.order_by(rows.variant_id).drop("locus", "alleles")

print("\n=== Exporting to Elasticsearch ===")
es = ElasticsearchClient(args.host, args.port)
es.export_table_to_elasticsearch(
    rows,
    index_name=index_name,
    index_type_name=args.index_type,
    block_size=args.es_block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    export_globals_to_index_meta=True,
    verbose=True,
)
def update_dataset(index_name, args):
    elasticsearch_client = ElasticsearchClient(host=args.host, port=args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)
    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        raise ValueError(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index."
        )

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    genome_version = args.genome_version or _meta.get("genomeVersion")

    if genome_version is None:
        match = re.search("__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            raise ValueError(
                "ERROR: couldn't update clinvar in {} because the genome version wasn't found in "
                "_meta ({}) or in the index name.".format(index_name, _meta))
        genome_version = match.group(1)

    print('dataset_path: %s' % dataset_path)

    # Import the VCFs from inputs. Set min partitions so that local pipeline execution takes advantage of all CPUs.
    mt = hl.import_vcf(dataset_path,
                       reference_genome='GRCh' + genome_version,
                       force_bgz=True,
                       min_partitions=500)

    # If the input VCF contains duplicated loci, drop the duplicate rows before re-keying by locus and alleles
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    mt = hl.split_multi_hts(
        mt.annotate_rows(locus_old=mt.locus, alleles_old=mt.alleles))

    if args.update_clinvar:
        clinvar = hl.read_table(clinvar_ht_path)
        mt = CLINVARSchema(mt, clinvar_data=clinvar).clinvar()

    if args.update_hgmd:
        hgmd = hl.read_table(hgmd_ht_path)
        mt = HGMDSchema(mt, hgmd_data=hgmd).hgmd()

    if args.update_cidr:
        cidr = hl.read_table(cidr_ht_path)
        mt = CIDRSchema(mt, cidr_data=cidr).cidr()

    mt = mt.aIndex().doc_id()
    mt = mt.select_annotated_mt()

    variant_count = mt.count_rows()
    logger.info(
        "\n==> exporting {} variants to elasticsearch:".format(variant_count))

    row_table = mt.rows().flatten()
    row_table = row_table.drop(row_table.locus, row_table.alleles)

    elasticsearch_client.export_table_to_elasticsearch(
        row_table,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )
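The error message in update_dataset() refers to --index-name, --dataset-path, and --genome-version flags, implying an argparse wrapper script. A hypothetical way to exercise the function directly, with a Namespace standing in for those parsed flags (the index name, dataset path, and flag names are illustrative, not taken from the source):

from argparse import Namespace

# Hypothetical arguments mirroring the argparse flags referenced above; values are illustrative.
args = Namespace(host='localhost', port=9200,
                 dataset_path='gs://my-bucket/my_callset.vcf.gz',
                 genome_version='38',
                 block_size=200,
                 update_clinvar=False, update_hgmd=False, update_cidr=False)

update_dataset('my_project__grch38__variants', args)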