Example #1
def export_ht_to_es(ht,
                    host='172.23.117.23',
                    port=9200,
                    index_name='pcgc_chr20_test',
                    index_type='variant',
                    es_block_size=200,
                    num_shards=1):
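    """Export the Hail table `ht` to an Elasticsearch index, deleting any
    existing index of the same name first and copying the table's globals
    into the index metadata."""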

    es = ElasticsearchClient(host, port)

    es.export_table_to_elasticsearch(
        ht,
        index_name=index_name,
        index_type_name=index_type,
        block_size=es_block_size,
        num_shards=num_shards,
        delete_index_before_exporting=True,
        export_globals_to_index_meta=True,
        verbose=True,
    )
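
# A usage sketch (not in the original snippet): given a Hail table `ht` that
# has already been read, one call exports it with the defaults above.
#
#     export_ht_to_es(ht, index_name='my_test_index')  # hypothetical index name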
path_rare_variants = 'gs://schizophrenia-browser/171211/2017-12-11_release-v1-browser-variant-in-schema.kt'
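
# path_pop and path_annotations are set earlier in the original script; their
# definitions are not shown in this snippet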

kt_pop = hc.read_table(path_pop)
kt_annotations = hc.read_table(path_annotations)
kt_rare_variants = hc.read_table(path_rare_variants)

pprint(kt_pop.schema)
pprint(kt_annotations.schema)
pprint(kt_rare_variants.schema)

ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'chrom = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]
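# each expression copies a value derived from the variant key `v` into a
# plain top-level column, so `v` itself can be dropped after annotating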

for expression in annotation_expressions:
    kt_rare_variants = kt_rare_variants.annotate(expression)

kt_rare_variants = kt_rare_variants.drop(['v'])

kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v'])
Example #3
kt_variants = kt_variant_results.key_by('v').join(kt_variant_annotation.key_by('v'))
kt_variants = kt_variants.drop(['v'])

kt_results_by_cohort = kt_results_by_cohort.rename(column_map)
for expression in annotation_expressions:
    kt_results_by_cohort = kt_results_by_cohort.annotate(expression)

kt_results_by_cohort = kt_results_by_cohort.drop(['v'])

ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

es.export_kt_to_elasticsearch(
    kt_variants,
    index_name='schizophrenia_exome_variants_results_180512',
    index_type_name='schizophrenia_exome_variant',
    block_size=1000,
    num_shards=2,
    delete_index_before_exporting=True,
    verbose=True,
)

es.export_kt_to_elasticsearch(
    kt_results_by_cohort,
    index_name='schizophrenia_exome_variants_groups_180512',
    # the call is truncated in the original source; the remaining keyword
    # arguments presumably mirror the export above
)
Example #4
def export_to_elasticsearch(
    host,
    port,
    vds,
    index_name,
    args,
    operation=ELASTICSEARCH_INDEX,
    delete_index_before_exporting=False,
    export_genotypes=True,
    disable_doc_values_for_fields=(),
    disable_index_for_fields=(),
    export_snapshot_to_google_bucket=False,
    start_with_sample_group=0,
):
    """Utility method for exporting the given vds to an elasticsearch index.
    """

    logger.info("Input: " + input_path)

    index_type = "variant"

    if export_genotypes:
        if args.analysis_type == "GATK_VARIANTS":
            genotype_fields_to_export = DEFAULT_GENOTYPE_FIELDS_TO_EXPORT
            genotype_field_to_elasticsearch_type_map = DEFAULT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
        elif args.analysis_type in ["MANTA_SVS", "JULIA_SVS"]:
            genotype_fields_to_export = [
                'num_alt = if(g.GT.isCalled()) g.GT.nNonRefAlleles() else -1',
                #'genotype_filter = g.FT',
                #'gq = g.GQ',
                'dp = if(g.GT.isCalled()) [g.PR.sum + g.SR.sum, ' +
                ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE +
                '].min() else NA:Int',
                'ab = let total=g.PR.sum + g.SR.sum in if(g.GT.isCalled() && total != 0) ((g.PR[1] + g.SR[1]) / total).toFloat else NA:Float',
                'ab_PR = let total=g.PR.sum in if(g.GT.isCalled() && total != 0) (g.PR[1] / total).toFloat else NA:Float',
                'ab_SR = let total=g.SR.sum in if(g.GT.isCalled() && total != 0) (g.SR[1] / total).toFloat else NA:Float',
                'dp_PR = if(g.GT.isCalled()) [g.PR.sum,' +
                ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE +
                '].min() else NA:Int',
                'dp_SR = if(g.GT.isCalled()) [g.SR.sum,' +
                ELASTICSEARCH_MAX_SIGNED_SHORT_INT_TYPE +
                '].min() else NA:Int',
            ]
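            # PR and SR are Manta's paired-read and split-read support counts;
            # the "[x, MAX].min()" pattern clamps each depth so it fits in an
            # Elasticsearch signed short. The regex-keyed mapping below
            # disables doc_values for these per-genotype fields, which saves
            # disk since they are never sorted or aggregated on.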

            genotype_field_to_elasticsearch_type_map = {
                ".*_num_alt": {
                    "type": "byte",
                    "doc_values": "false"
                },
                #".*_genotype_filter": {"type": "keyword", "doc_values": "false"},
                #".*_gq": {"type": "short", "doc_values": "false"},
                ".*_dp": {
                    "type": "short",
                    "doc_values": "false"
                },
                ".*_ab": {
                    "type": "half_float",
                    "doc_values": "false"
                },
                ".*_ab_PR": {
                    "type": "half_float",
                    "doc_values": "false"
                },
                ".*_ab_SR": {
                    "type": "half_float",
                    "doc_values": "false"
                },
                ".*_dp_PR": {
                    "type": "short",
                    "doc_values": "false"
                },
                ".*_dp_SR": {
                    "type": "short",
                    "doc_values": "false"
                },
            }
        else:
            raise ValueError("Unexpected args.analysis_type: %s" %
                             args.analysis_type)
    else:
        genotype_fields_to_export = []
        genotype_field_to_elasticsearch_type_map = {}

    vds = vds.persist()

    client = ElasticsearchClient(host, port)
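
    # `sample_groups` is defined outside this snippet; its construction is
    # not shown in the original source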
    for i, sample_group in enumerate(sample_groups):

        if i < start_with_sample_group:
            continue

        #if delete_index_before_exporting and i < 4:
        #    continue

        if len(sample_groups) > 1:
            vds_sample_subset = vds.filter_samples_list(sample_group,
                                                        keep=True)
            current_index_name = "%s_%s" % (index_name, i)
        else:
            vds_sample_subset = vds
            current_index_name = index_name

        logger.info("==> exporting %s samples into %s" %
                    (len(sample_group), current_index_name))
        logger.info(
            "Samples: %s .. %s" %
            (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))

        logger.info("==> export to elasticsearch")
        pprint(vds.variant_schema)

        timestamp1 = time.time()

        client.export_vds_to_elasticsearch(
            vds_sample_subset,
            genotype_fields_to_export=genotype_fields_to_export,
            genotype_field_to_elasticsearch_type_map=genotype_field_to_elasticsearch_type_map,
            index_name=current_index_name,
            index_type_name=index_type,
            block_size=args.block_size,
            num_shards=args.num_shards,
            delete_index_before_exporting=delete_index_before_exporting,
            elasticsearch_write_operation=operation,
            elasticsearch_mapping_id="docId",
            disable_doc_values_for_fields=disable_doc_values_for_fields,
            disable_index_for_fields=disable_index_for_fields,
            is_split_vds=True,
            verbose=True,
        )

        timestamp2 = time.time()
        logger.info("==> finished exporting - time: %s seconds" %
                    (timestamp2 - timestamp1))

    if export_snapshot_to_google_bucket:

        logger.info("==> export snapshot to google bucket")
        client.create_elasticsearch_snapshot(
            index_name=index_name + "*",
            bucket="seqr-database-backups",
            base_path="elasticsearch/snapshots",
            snapshot_repo="callsets")

    client.save_index_operation_metadata(
        args.input_vds,
        index_name,
        args.genome_version,
        fam_file=args.fam_file,
        remap_sample_ids=args.remap_sample_ids,
        subset_samples=args.subset_samples,
        skip_vep=args.skip_vep,
        project_id=args.project_id,
        analysis_type=args.analysis_type,
        sample_type=args.sample_type,
        command=" ".join(sys.argv),
        directory=args.directory,
        username=args.username,
        operation="create_index",
        status="success",
    )
args = p.parse_args()

es = elasticsearch.Elasticsearch(args.host, port=args.port)

existing_indices = es.indices.get(index="*").keys()
if args.index not in existing_indices:
    p.error("%s not found. Existing indices are: %s" % (args.index, existing_indices))

# see https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-snapshots.html
snapshot_name = "snapshot_%s__%s" % (args.index.lower(), time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()))
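# e.g. "snapshot_my_index__2018-05-12_13-45-05"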

# see https://www.elastic.co/guide/en/elasticsearch/plugins/current/repository-gcs-repository.html
print("==> Check if snapshot repo exists: %s" % args.repo)
repo_info = es.snapshot.get_repository(repository=args.repo)
pprint(repo_info)

print("==> Creating snapshot in gs://%s/%s for index %s" % (args.bucket, args.base_path, args.index))

client = ElasticsearchClient(args.host, args.port)

client.create_elasticsearch_snapshot(
    index_name=args.index + "*",
    bucket=args.bucket,
    base_path=args.base_path,
    snapshot_repo=args.repo)

print("==> Getting snapshot status for: " + snapshot_name)
pprint(
    es.snapshot.status(repository=args.repo)
)
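
# A follow-up sketch (not in the original script): block until the snapshot
# leaves the IN_PROGRESS state, using the same status call as above.
while any(s["state"] == "IN_PROGRESS"
          for s in es.snapshot.status(repository=args.repo)["snapshots"]):
    time.sleep(10)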
Example #6
COVERAGE_PATHS = EXOME_COVERAGE_CSV_PATHS[-1]

kt_coverage = hc.import_table(COVERAGE_PATHS, types=types)
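# rename the coverage CSV's numeric threshold columns ("1", "5", ...) to
# descriptive "over*" names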
kt_coverage = kt_coverage.rename({
    '#chrom': 'chrom',
    '1': 'over1',
    '5': 'over5',
    '10': 'over10',
    '15': 'over15',
    '20': 'over20',
    '25': 'over25',
    '30': 'over30',
    '50': 'over50',
    '100': 'over100',
})
print(kt_coverage.schema)
print("======== Export exome coverage to elasticsearch ======")

es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

es.export_kt_to_elasticsearch(kt_coverage,
                              index_name=args.index,
                              index_type_name=args.index_type,
                              num_shards=args.num_shards,
                              block_size=args.block_size,
                              delete_index_before_exporting=True,
                              verbose=True)
    "category",
]

for field_name in transcript_annotations_to_keep:
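    # convert the snake_case annotation name to camelCase
    # (e.g. "gene_symbol" -> "geneSymbol") before copying it out of
    # mainTranscript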
    new_field_name = field_name.split("_")[0] + "".join(
        word.capitalize() for word in field_name.split("_")[1:])
    kt = kt.annotate("%(new_field_name)s = mainTranscript.%(field_name)s" % locals())

kt = kt.drop(["mainTranscript"])

# pprint(kt.schema)

DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", )
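# indexing and doc_values are disabled for sortedTranscriptConsequences,
# presumably because this large nested field is only retrieved, never
# searched or aggregated on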

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

es.export_kt_to_elasticsearch(
    kt,
    index_name=args.index,
    index_type_name=args.index_type,
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    disable_doc_values_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    disable_index_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    verbose=True,
)
import os
os.system(
    "pip install elasticsearch"
)  # this used to be `import pip; pip.main(['install', 'elasticsearch']);`, but pip.main is deprecated as of pip v10

import argparse
from utils.elasticsearch_client import ElasticsearchClient

p = argparse.ArgumentParser()
p.add_argument(
    "-H",
    "--host",
    help="elasticsearch client host. The default address works if "
    "`kubectl proxy` is running in the background.",
    default="http://localhost:8001/api/v1/namespaces/default/services/elasticsearch:9200/proxy",
)
p.add_argument("-p",
               "--port",
               help="elasticsearch client port.",
               default="30001")

args = p.parse_args()

# to get the ip address, run  `kubectl describe pod elasticsearch-1019229749-vhghc`
ELASTICSEARCH_HOST = args.host
ELASTICSEARCH_PORT = args.port

es = ElasticsearchClient(ELASTICSEARCH_HOST, port=ELASTICSEARCH_PORT)
es.print_elasticsearch_stats()
# the start of this snippet is truncated in the original source; based on the
# parallel loop in Example #4, it iterates over sample groups and subsets the
# VDS for each one
for i, sample_group in enumerate(sample_groups):
    if len(sample_groups) > 1:
        vds_sample_subset = vds.filter_samples_list(sample_group, keep=True)
        index_name = "%s_%s" % (args.index_name, i)
    else:
        vds_sample_subset = vds
        index_name = args.index_name

    logger.info("==> loading %s samples into %s" %
                (len(sample_group), index_name))
    logger.info("Samples: %s .. %s" %
                (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))

    logger.info("==> export to elasticsearch")
    DISABLE_INDEX_FOR_FIELDS = ("sortedTranscriptConsequences", )
    DISABLE_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", )

    timestamp1 = time.time()
    es = ElasticsearchClient(
        host=args.host,
        port=args.port,
    )
    es.export_vds_to_elasticsearch(
        vds_sample_subset,
        genotype_fields_to_export=DEFAULT_GENOTYPE_FIELDS_TO_EXPORT,
        genotype_field_to_elasticsearch_type_map=DEFAULT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP,
        host=args.host,
        port=args.port,
        index_name=index_name,
        index_type_name=args.index_type,
        block_size=args.block_size,
        num_shards=args.num_shards,
        delete_index_before_exporting=True,
        disable_doc_values_for_fields=DISABLE_DOC_VALUES_FOR_FIELDS,
        disable_index_for_fields=DISABLE_INDEX_FOR_FIELDS,
        # the call is truncated in the original source; the remaining keyword
        # arguments are not shown
    )

if args.add_all_annotations or args.add_gnomad_coverage:
    logger.info("\n==> Add gnomad coverage")
    vds = vds.persist()
    vds = add_gnomad_exome_coverage_to_vds(hc,
                                           vds,
                                           args.genome_version,
                                           root="va.gnomad_exome_coverage")
    vds = add_gnomad_genome_coverage_to_vds(hc,
                                            vds,
                                            args.genome_version,
                                            root="va.gnomad_genome_coverage")

vds = vds.persist()

# export to Elasticsearch
client = ElasticsearchClient(args.host, args.port)
timestamp1 = time.time()

if args.dry_run:
    logger.info("Dry run finished. Next step would be to export to index: " +
                str(index_name))
else:
    client.export_vds_to_elasticsearch(
        vds,
        genotype_fields_to_export=genotype_fields_to_export,
        genotype_field_to_elasticsearch_type_map=genotype_field_to_elasticsearch_type_map,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        num_shards=args.num_shards,