def update_all_datasets(hc, args):
    """Iterate over all Elasticsearch indices and update reference data in each one that
    records a sourceFilePath in its index _meta."""
    client = ElasticsearchClient(args.host, port=args.port)
    indices = client.es.cat.indices(h="index", s="index").strip().split("\n")
    for i, index_name in enumerate(indices):
        _meta = client.get_index_meta(index_name)

        logger.info("==> updating index {} out of {}: {}".format(
            i + 1, len(indices), index_name))
        if _meta and "sourceFilePath" in _meta:
            logger.info("==> index _meta['sourceFilePath']: {}".format(_meta["sourceFilePath"]))
            try:
                update_dataset(hc, index_name, args)
            except Exception as e:
                logger.error("ERROR while updating %s - %s: %s", index_name, _meta["sourceFilePath"], e)
        else:
            logger.info("==> skipping {} because index _meta['sourceFilePath'] isn't set: {}".format(
                index_name, _meta))
def update_dataset(hc, index_name, args):
    """Re-annotate the variants in the given index with the selected reference datasets and
    write the updated annotations back to Elasticsearch."""
    elasticsearch_client = ElasticsearchClient(args.host, args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)
    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        logger.error(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index.".format(index_name))
        return

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    genome_version = args.genome_version or _meta.get("genomeVersion")

    if genome_version is None:
        # fall back on parsing the genome version out of the index name (eg. "my_index__grch38__...")
        match = re.search("__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            logger.error(
                "Couldn't update reference data in {} because the genome version wasn't found in _meta ({}) "
                "or in the index name.".format(index_name, _meta))
            return
        genome_version = match.group(1)

    vds = read_in_dataset(hc, dataset_path)
    vds = vds.drop_samples()
    vds = compute_minimal_schema(vds)
    vds = vds.annotate_global_expr('global.genomeVersion = "{}"'.format(genome_version))

    # add reference data to vds
    filter_expr = []
    if args.update_primate_ai:
        vds = add_primate_ai_to_vds(hc, vds, genome_version, root="va.primate_ai")
        filter_expr.append("isDefined(va.primate_ai.score)")

    if args.update_splice_ai:
        vds = add_splice_ai_to_vds(hc, vds, genome_version, root="va.splice_ai")
        filter_expr.append("isDefined(va.splice_ai.delta_score)")

    if args.update_clinvar:
        #vds = reset_clinvar_fields_in_vds(hc, vds, genome_version, root="va.clinvar", subset=filter_interval)
        vds = add_clinvar_to_vds(hc, vds, genome_version, root="va.clinvar")
        filter_expr.append("isDefined(va.clinvar.allele_id)")

    if args.update_hgmd:
        #vds = reset_hgmd_fields_in_vds(hc, vds, genome_version, root="va.hgmd", subset=filter_interval)
        vds = add_hgmd_to_vds(hc, vds, genome_version, root="va.hgmd")
        filter_expr.append("isDefined(va.hgmd.accession)")

    # filter down to variants that have reference data, so the export only touches documents
    # that actually changed; bail out if no datasets were selected (an empty filter expression
    # would be invalid)
    if not filter_expr:
        logger.error("No reference datasets were selected for update")
        return
    vds = vds.filter_variants_expr(" || ".join(filter_expr), keep=True)

    print("\n\n==> schema: ")
    pprint(vds.variant_schema)

    _, variant_count = vds.count()
    logger.info("\n==> exporting {} variants to elasticsearch:".format(variant_count))

    elasticsearch_client.export_vds_to_elasticsearch(
        vds,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        is_split_vds=True,
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )
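

# The sketch below is a hypothetical command-line entry point, inferred from the attributes
# these functions read off `args` (host, port, dataset_path, genome_version, block_size, and
# the update_* flags) and from the --index-name / --dataset-path / --genome-version flags
# mentioned in the error messages above. It is an assumption about how this script is wired
# up, not the original CLI definition; the HailContext call assumes the Hail 0.1-style API
# implied by the vds methods used above.
if __name__ == "__main__":
    import argparse

    import hail

    p = argparse.ArgumentParser()
    p.add_argument("--host", default="localhost", help="elasticsearch host")
    p.add_argument("--port", type=int, default=9200, help="elasticsearch port")
    p.add_argument("--index-name", help="update only this index instead of all indices")
    p.add_argument("--dataset-path", help="override the sourceFilePath recorded in the index _meta")
    p.add_argument("--genome-version", help="override the genomeVersion recorded in the index _meta")
    p.add_argument("--block-size", type=int, default=1000, help="elasticsearch export block size")
    p.add_argument("--update-primate-ai", action="store_true")
    p.add_argument("--update-splice-ai", action="store_true")
    p.add_argument("--update-clinvar", action="store_true")
    p.add_argument("--update-hgmd", action="store_true")
    args = p.parse_args()

    hc = hail.HailContext()
    if args.index_name:
        update_dataset(hc, args.index_name, args)
    else:
        update_all_datasets(hc, args)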