import json

import elasticsearch
import hail as hl
from elasticsearch import helpers


def export_table_to_elasticsearch(
    table,
    host,
    index_name,
    block_size=5000,
    id_field=None,
    mapping=None,
    num_shards=10,
    port=9200,
    verbose=True,
):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    # Store the table's globals in the index metadata
    mapping["_meta"] = dict(hl.eval(table.globals))

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        "mappings": mapping,
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    # Export the table as one JSON document per line, then bulk index it in blocks of block_size documents
    temp_file = "table-tmp.json.txt"
    table = table.key_by()
    table.select(json=hl.json(table.row_value)).export(temp_file, header=False)

    buffer = []
    with open(temp_file) as f:
        for line in f:
            data = json.loads(line)
            buffer.append(data)

            if len(buffer) >= block_size:
                helpers.bulk(es_client, build_bulk_request(buffer, index_name, id_field))
                buffer = []

    if buffer:
        helpers.bulk(es_client, build_bulk_request(buffer, index_name, id_field))
        buffer = []

    es_client.indices.forcemerge(index=index_name)
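
# `build_bulk_request` is called above but not defined in this section. A minimal sketch of
# what it might look like, assuming each buffered item is a dict parsed from the exported
# JSON and that `id_field`, when given, names the field to use as the Elasticsearch document
# _id. This is an illustrative assumption, not the actual helper.
def build_bulk_request(documents, index_name, id_field=None):
    actions = []
    for doc in documents:
        # Action dicts in the format accepted by elasticsearch.helpers.bulk
        action = {"_index": index_name, "_source": doc}
        if id_field is not None:
            action["_id"] = doc[id_field]
        actions.append(action)
    return actions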
def test(self):
    schema = hl.tstruct(
        a=hl.tint32,
        b=hl.tint32,
        c=hl.tint32,
        d=hl.tint32,
        e=hl.tstr,
        f=hl.tarray(hl.tint32),
        g=hl.tarray(hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
        i=hl.tbool,
        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr),
    )

    rows = [
        {
            'a': 4,
            'b': 1,
            'c': 3,
            'd': 5,
            'e': "hello",
            'f': [1, 2, 3],
            'g': [hl.Struct(x=1, y=5, z='banana')],
            'h': hl.Struct(a=5, b=3, c='winter'),
            'i': True,
            'j': hl.Struct(x=3, y=2, z='summer'),
        }
    ]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(
        kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10),
        ).take(1)[0]
    )
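
# `convert_struct_to_dict` is used by the test above but not defined in this section. A
# plausible sketch, assuming it recursively converts hl.Struct values (and containers of
# them) into plain Python dicts and lists so results can be compared against ordinary
# literals. Illustrative only; the real helper may differ.
def convert_struct_to_dict(x):
    if isinstance(x, hl.Struct):
        return {k: convert_struct_to_dict(v) for k, v in x.items()}
    elif isinstance(x, dict):
        return {k: convert_struct_to_dict(v) for k, v in x.items()}
    elif isinstance(x, (list, tuple)):
        return [convert_struct_to_dict(v) for v in x]
    else:
        return x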
def main(args):
    hl.init(default_reference='GRCh38', log='/variant_histograms.log')

    ht = hl.read_table(release_ht_path())
    # NOTE: histogram aggregations are done on the entire callset (not just PASS variants), on raw data
    hist_dict = ANNOTATIONS_HISTS
    hist_dict['MQ'] = (20, 60, 40)  # Boundaries changed for v3, but could be a good idea to settle on a standard
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS)

    # NOTE: run the following code in a first pass to determine bounds for metrics
    # Evaluate minimum and maximum values for each metric of interest
    # This doesn't need to be run unless the defaults do not result in nice-looking histograms.
    if args.first_pass:
        minmax_dict = {}
        for metric in hist_ranges_expr.keys():
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht[metric]),
                max=hl.if_else(hl.agg.max(ht[metric]) < 1e10, hl.agg.max(ht[metric]), 1e10),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        print(minmax)
    else:
        # Aggregate hists over hand-tooled ranges
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            ).extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(hl.log10(ht.info.QUALapprox), 1, 10, 36),
                    )
                ).map(lambda x: x[1].annotate(metric=x[0]))
            ),
            _localize=False,
        )

        with hl.hadoop_open(qual_hists_json_path(CURRENT_RELEASE), 'w') as f:
            f.write(hl.eval(hl.json(hists)))
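
# `get_annotations_hists` is imported from elsewhere and not shown in this section. A rough
# sketch of the behavior the call above relies on, assuming ANNOTATIONS_HISTS maps a metric
# name to a (start, end, n_bins) tuple and that each metric is a row field of the Table.
# This is an illustrative assumption, not the actual implementation.
def get_annotations_hists(ht, annotations_hists):
    return {
        metric: hl.agg.hist(ht[metric], start, end, n_bins)
        for metric, (start, end, n_bins) in annotations_hists.items()
    }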
def main(args): hl.init(default_reference="GRCh38", log="/variant_histograms.log") logger.info("Loading ANNOTATIONS_HISTS dictionary...") if not file_exists(annotation_hists_path()): raise DataException( "Annotation hists JSON file not found. Need to create this JSON before running script!" ) with hl.hadoop_open(annotation_hists_path()) as a: ANNOTATIONS_HISTS = json.loads(a.read()) # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data ht = hl.read_table(release_ht_path(public=False)) ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS)) inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"] # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be # handled differently. It is stored as a dictionary in annotation_hists_path ANNOTATIONS_HISTS.remove("InbreedingCoeff") logger.info("Getting info annotation histograms...") hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS) # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary if args.determine_bounds: logger.info( "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10." ) minmax_dict = {} for metric in ANNOTATIONS_HISTS: minmax_dict[metric] = hl.struct( min=hl.agg.min(ht.info[metric]), max=hl.if_else( hl.agg.max(ht.info[metric]) < 1e10, hl.agg.max(ht.info[metric]), 1e10, ), ) minmax = ht.aggregate(hl.struct(**minmax_dict)) logger.info(f"Metrics bounds: {minmax}") else: logger.info( "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can " "be used to help define these ranges..." ) hists = ht.aggregate( hl.array( [ hist_expr.annotate(metric=hist_metric) for hist_metric, hist_expr in hist_ranges_expr.items() ] ) .extend( hl.array( hl.agg.group_by( create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF), hl.agg.hist( hl.log10(ht.info.QUALapprox), *ANNOTATIONS_HISTS["QUALapprox"], ), ) ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0])) ) .extend( hl.array( hl.agg.group_by( create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF), hl.agg.hist( hl.log10(ht.info.AS_QUALapprox), *ANNOTATIONS_HISTS["AS_QUALapprox"], ), ) ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0])) ), _localize=False, ) # Defining hist range and bins for allele frequency groups because they needed different ranges ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF)) inbreeding_hists = [ ht.aggregate( hl.agg.filter( ht.af_bin == x, hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x],), ) ).annotate(metric="InbreedingCoeff" + "-" + x) for x in inbreeding_bin_ranges ] hists = hl.eval(hl.json(hists)) inbreeding_hists = hl.eval(hl.json(inbreeding_hists)) # Note: The following removes '}' from the JSON stored in hists and '{' from the JSON stored in # inbreeding_hists then joins them together to be written out as a single JSON hists = hists[:-1] + "," + inbreeding_hists[1:] logger.info("Writing output") with hl.hadoop_open(qual_hists_json_path(), "w") as f: f.write(hists)
def write_data_files(table_path, output_directory, genes=None):
    if output_directory.startswith("gs://"):
        raise Exception("Cannot write output to Google Storage")

    ds = hl.read_table(table_path)

    os.makedirs(output_directory, exist_ok=True)
    with open(f"{output_directory}/metadata.json", "w") as output_file:
        output_file.write(hl.eval(hl.json(ds.globals.meta)))

    gene_search_terms = ds.select(data=hl.json(hl.tuple([ds.gene_id, ds.search_terms])))
    gene_search_terms.key_by().select("data").export(f"{output_directory}/gene_search_terms.json.txt", header=False)
    os.remove(f"{output_directory}/.gene_search_terms.json.txt.crc")

    ds = ds.drop("previous_symbols", "alias_symbols", "search_terms")

    # Write one top-level results file per dataset
    os.makedirs(f"{output_directory}/results", exist_ok=True)
    for dataset in ds.globals.meta.datasets.dtype.fields:
        reference_genome = "GRCh38" if dataset == "bipex" else "GRCh37"

        gene_results = ds.filter(hl.is_defined(ds.gene_results[dataset]))
        gene_results = gene_results.select(
            result=hl.tuple(
                [
                    gene_results.gene_id,
                    gene_results.symbol,
                    gene_results.name,
                    gene_results[reference_genome].chrom,
                    (gene_results[reference_genome].start + gene_results[reference_genome].stop) // 2,
                    gene_results.gene_results[dataset].group_results,
                ]
            )
        )
        gene_results = gene_results.collect()
        gene_results = [r.result for r in gene_results]

        with open(f"{output_directory}/results/{dataset.lower()}.json", "w") as output_file:
            output_file.write(json.dumps({"results": gene_results}, cls=ResultEncoder))

    if genes:
        ds = ds.filter(hl.set(genes).contains(ds.gene_id))

    # Export rows as JSON lines so per-gene files can be written in parallel worker processes
    temp_file_name = "temp.tsv"
    n_rows = ds.count()
    ds.select(data=hl.json(ds.row)).export(f"{output_directory}/{temp_file_name}", header=False)

    csv.field_size_limit(sys.maxsize)

    os.makedirs(f"{output_directory}/genes", exist_ok=True)
    with multiprocessing.get_context("spawn").Pool() as pool:
        with open(f"{output_directory}/{temp_file_name}") as data_file:
            reader = csv.reader(data_file, delimiter="\t")
            for gene_id, gene_grch37, gene_grch38, all_variants in tqdm(pool.imap(split_data, reader), total=n_rows):
                # Shard gene files into 1000 subdirectories based on the numeric part of the gene ID
                num = int(gene_id.lstrip("ENSGR"))
                gene_dir = f"{output_directory}/genes/{str(num % 1000).zfill(3)}"
                os.makedirs(gene_dir, exist_ok=True)

                if gene_grch37:
                    with open(f"{gene_dir}/{gene_id}_GRCh37.json", "w") as out_file:
                        out_file.write(gene_grch37)

                if gene_grch38:
                    with open(f"{gene_dir}/{gene_id}_GRCh38.json", "w") as out_file:
                        out_file.write(gene_grch38)

                for dataset, dataset_variants in all_variants.items():
                    if dataset_variants:
                        with open(f"{gene_dir}/{gene_id}_{dataset.lower()}_variants.json", "w") as out_file:
                            out_file.write(dataset_variants)

    os.remove(f"{output_directory}/{temp_file_name}")
    os.remove(f"{output_directory}/.{temp_file_name}.crc")
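
# `split_data` is the worker handed to pool.imap above but is not defined in this section.
# A rough sketch of the contract the loop relies on, assuming each exported TSV row holds a
# single JSON-encoded gene record; the field names inside the record ("GRCh37", "GRCh38",
# "variants") are illustrative assumptions, not the actual schema.
def split_data(row):
    (json_string,) = row  # the export above writes one JSON column per line
    gene = json.loads(json_string)
    gene_id = gene["gene_id"]
    # Return JSON strings ready to be written directly to per-gene files
    gene_grch37 = json.dumps(gene["GRCh37"]) if gene.get("GRCh37") else None
    gene_grch38 = json.dumps(gene["GRCh38"]) if gene.get("GRCh38") else None
    all_variants = {
        dataset: json.dumps(variants) if variants else None
        for dataset, variants in gene.get("variants", {}).items()
    }
    return gene_id, gene_grch37, gene_grch38, all_variants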