def push_data_keywords(pub_ids=None, index=None):
    """
    Go through all the publications and their datatables and move data
    keywords from tables to their parent publications.

    :param pub_ids: [list] ids of the publications to process. If empty or
        None, the ids are taken from a match_all search over the
        publication doc type.
    :param index: [string] name of the index, passed unchanged to
        Elasticsearch.
    :return: None. Each publication is updated in place with a
        ``data_keywords`` field mapping keyword name to the list of
        distinct values found on its data tables.
    """
    if not pub_ids:
        # NOTE(review): no ``size`` is passed here (nor in the per-publication
        # table search below), so Elasticsearch's default page size applies.
        # Confirm that this is enough to cover every publication / table.
        body = {"query": {"match_all": {}}}
        results = es.search(index=index, doc_type=CFG_PUB_TYPE, body=body, _source=False)
        pub_ids = [hit["_id"] for hit in results["hits"]["hits"]]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            "publication",
            relation="parent",
            must=True,
            related_query={"match": {"recid": pub_id}},
        )
        tables = es.search(
            index=index,
            doc_type=CFG_DATA_TYPE,
            body=query_builder.query,
            _source_include="keywords",
        )

        # Aggregate values per keyword name; a set removes duplicates as we go.
        agg_keywords = defaultdict(set)
        for table in tables["hits"]["hits"]:
            # A table without keywords must be skipped instead of breaking
            # the aggregation for the whole publication.
            for kw in table["_source"].get("keywords") or []:
                agg_keywords[kw["name"]].add(kw["value"])

        data_keywords = {name: list(values) for name, values in agg_keywords.items()}
        body = {"doc": {"data_keywords": data_keywords}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            # Best effort: log and carry on with the next publication.
            # ``e.message`` is not available on Python 3 exceptions, so the
            # exception itself is formatted into the log record.
            log.error("Unable to update data keywords of publication %s: %s", pub_id, e)
def push_data_keywords(pub_ids=None, index=None):
    """
    Go through all the publications and their datatables and move data
    keywords from tables to their parent publications.

    :param pub_ids: [list] ids of the publications to process. If empty or
        None, the ids are taken from a match_all search over the
        publication doc type.
    :param index: [string] name of the index, passed unchanged to
        Elasticsearch.
    :return: None. Each publication is updated in place with a
        ``data_keywords`` field mapping keyword name to the list of
        distinct values found on its data tables.
    """
    if not pub_ids:
        # NOTE(review): no ``size`` is passed here (nor in the per-publication
        # table search below), so Elasticsearch's default page size applies.
        # Confirm that this is enough to cover every publication / table.
        body = {'query': {'match_all': {}}}
        results = es.search(index=index,
                            doc_type=CFG_PUB_TYPE,
                            body=body,
                            _source=False)
        pub_ids = [hit['_id'] for hit in results['hits']['hits']]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            'publication',
            relation='parent',
            must=True,
            related_query={'match': {'recid': pub_id}})
        tables = es.search(index=index,
                           doc_type=CFG_DATA_TYPE,
                           body=query_builder.query,
                           _source_include='keywords')

        # Aggregate values per keyword name; a set removes duplicates as we go.
        agg_keywords = defaultdict(set)
        for table in tables['hits']['hits']:
            # A table without keywords must be skipped instead of breaking
            # the aggregation for the whole publication.
            for kw in table['_source'].get('keywords') or []:
                agg_keywords[kw['name']].add(kw['value'])

        data_keywords = {
            name: list(values) for name, values in agg_keywords.items()
        }
        body = {"doc": {'data_keywords': data_keywords}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            # Best effort: log and carry on with the next publication.
            # ``e.message`` is not available on Python 3 exceptions, so the
            # exception itself is formatted into the log record.
            log.error('Unable to update data keywords of publication %s: %s',
                      pub_id, e)
def search(query, index=None, filters=None, size=10, include="*",
           exclude="authors", offset=0, sort_field=None, sort_order='',
           post_filter=None):
    """
    Perform a search query.

    The search runs in two steps: publications are searched first, then the
    data tables whose parent is one of the returned publications are fetched
    and merged into the publication results.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    None (the default) means no filters.
    :param size: [int] max number of hits that should be returned
    :param include: source filter forwarded to
                    QueryBuilder.add_source_filter (fields to include)
    :param exclude: source filter forwarded to
                    QueryBuilder.add_source_filter (fields to exclude)
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: forwarded unchanged to QueryBuilder.add_post_filter

    :return: [dict] dictionary with processed results and facets
             (the output of map_result on the merged results)
    """
    # A fresh list per call: a list created in the signature would be a
    # single object shared by every call that omits ``filters``.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)

    # Build core query
    data_query = get_query_by_type(CFG_DATA_TYPE, query)
    pub_query = get_query_by_type(CFG_PUB_TYPE, query)
    authors_query = get_authors_query(query)

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_DATA_TYPE,
        relation="child",
        related_query=data_query,
        other_queries=[pub_query, authors_query])

    # Add additional options
    query_builder.add_pagination(size=size, offset=offset)
    query_builder.add_sorting(sort_field=sort_field, sort_order=sort_order)
    query_builder.add_filters(filters)
    query_builder.add_post_filter(post_filter)
    query_builder.add_aggregations()
    query_builder.add_source_filter(include, exclude)

    pub_result = es.search(index=index,
                           body=query_builder.query,
                           doc_type=CFG_PUB_TYPE)

    # Restrict the data table search to children of the publications that
    # were just returned.
    pub_ids = [hit["_id"] for hit in pub_result['hits']['hits']]
    parent_filter = {
        "filtered": {
            "filter": {
                "terms": {
                    "_id": pub_ids
                }
            }
        }
    }

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_PUB_TYPE,
        relation="parent",
        related_query=parent_filter,
        must=True,
        other_queries=[data_query])
    # NOTE(review): the factor 50 presumably leaves room for several data
    # tables per returned publication - confirm the intended upper bound.
    query_builder.add_pagination(size=size * 50)

    data_result = es.search(index=index,
                            body=query_builder.query,
                            doc_type=CFG_DATA_TYPE)

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results)
def search(
    query,
    index=None,
    filters=None,
    size=10,
    include="*",
    exclude="",
    offset=0,
    sort_field=None,
    sort_order="",
    post_filter=None,
):
    """
    Perform a search query.

    The search runs in two steps: publications are searched first, then the
    data tables whose parent is one of the returned publications are fetched
    and merged into the publication results.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    None (the default) means no filters.
    :param size: [int] max number of hits that should be returned
    :param include: source filter forwarded to
                    QueryBuilder.add_source_filter (fields to include)
    :param exclude: source filter forwarded to
                    QueryBuilder.add_source_filter (fields to exclude)
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: forwarded unchanged to QueryBuilder.add_post_filter

    :return: [dict] dictionary with processed results and facets
             (the output of map_result on the merged results)
    """
    # A fresh list per call: a list created in the signature would be a
    # single object shared by every call that omits ``filters``.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == "" and not sort_field:
        sort_field = "date"

    query = HEPDataQueryParser.parse_query(query)

    # Build core query
    data_query = get_query_by_type(CFG_DATA_TYPE, query)
    pub_query = get_query_by_type(CFG_PUB_TYPE, query)
    authors_query = get_authors_query(query)

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_DATA_TYPE,
        relation="child",
        related_query=data_query,
        other_queries=[pub_query, authors_query],
    )

    # Add additional options
    query_builder.add_pagination(size=size, offset=offset)
    query_builder.add_sorting(sort_field=sort_field, sort_order=sort_order)
    query_builder.add_filters(filters)
    query_builder.add_post_filter(post_filter)
    query_builder.add_aggregations()
    query_builder.add_source_filter(include, exclude)

    pub_result = es.search(index=index, body=query_builder.query, doc_type=CFG_PUB_TYPE)

    # Restrict the data table search to children of the publications that
    # were just returned.
    pub_ids = [hit["_id"] for hit in pub_result["hits"]["hits"]]
    parent_filter = {"filtered": {"filter": {"terms": {"_id": pub_ids}}}}

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_PUB_TYPE,
        relation="parent",
        related_query=parent_filter,
        must=True,
        other_queries=[data_query],
    )
    # NOTE(review): the factor 50 presumably leaves room for several data
    # tables per returned publication - confirm the intended upper bound.
    query_builder.add_pagination(size=size * 50)

    data_result = es.search(index=index, body=query_builder.query, doc_type=CFG_DATA_TYPE)

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results)