def company_name_iterator(self, max_size=100**100):
    """Yield unique employer names from the ads index, up to *max_size*.

    First runs a cardinality aggregation to estimate how many distinct
    names exist, then pages through them with partitioned ``terms``
    aggregations in batches of 1000 (Elasticsearch's include/partition
    mechanism), yielding one name per bucket.
    """
    companyname_field = "employer.name.keyword"
    es_index = settings.ES_INDEX
    query_unique_names = {
        "size": 0,
        "aggs": {
            "names_count": {
                "cardinality": {
                    "field": companyname_field
                }
            }
        }
    }
    unique_res = elastic.search(index=es_index, body=query_unique_names)
    unique_amount = int(unique_res['aggregations']['names_count']['value'])
    batch_size = 1000
    num_partitions = int(math.ceil(unique_amount / batch_size))
    aggs_query = {
        "size": 0,
        "aggs": {
            "names_agg": {
                "terms": {
                    "field": companyname_field,
                    "include": {
                        "partition": 0,
                        "num_partitions": num_partitions
                    },
                    "size": batch_size
                }
            }
        }
    }
    yielded = 0
    for partition_counter in range(num_partitions):
        aggs_query['aggs']['names_agg']['terms']['include'][
            'partition'] = partition_counter
        res = elastic.search(index=es_index, body=aggs_query)
        for bucket in res['aggregations']['names_agg']['buckets']:
            # Stop the whole generator once the cap is reached. The
            # original `break` only left the inner loop, so one search
            # per remaining partition was still issued for nothing.
            if yielded == max_size:
                return
            yielded += 1
            yield bucket['key']
def autocomplete(querystring):
    """Return up to 20 keyword completions for the last word of the query.

    All words before the last one are applied as a regular freetext
    filter (expired ads excluded, zero hits requested) and the trailing
    word is matched as a prefix against the ``keywords.raw`` terms
    aggregation.  Returns a list of matching keyword strings, or an
    empty list when the search produced no aggregations.
    """
    import re  # local import: module import header is outside this view

    if not querystring:
        querystring = ''
    words = querystring.split(' ')
    without_last = ' '.join(words[:-1])
    query_dsl = _parse_args({
        settings.FREETEXT_QUERY: without_last,
        settings.LIMIT: 0,
        settings.SHOW_EXPIRED: 'false'
    })
    complete = words[-1]
    query_dsl['aggs'] = {
        'complete': {
            "terms": {
                "field": "keywords.raw",
                "size": 20,
                # Escape the user-supplied prefix so regex metacharacters
                # (".", "*", "(", ...) are matched literally instead of
                # being interpreted by Elasticsearch's regexp engine.
                "include": "%s.*" % re.escape(complete)
            }
        }
    }
    query_result = elastic.search(index=settings.ES_AURANEST, body=query_dsl)
    if 'aggregations' in query_result:
        return [
            c['key'] for c in query_result.get('aggregations', {}).get(
                'complete', {}).get('buckets', [])
        ]
    return []
def get_stats_for(taxonomy_type):
    """Count currently published ads per taxonomy code.

    *taxonomy_type* is a sequence of taxonomy type identifiers.  Every
    entry is validated against the configured aggregation field paths,
    but the terms aggregation itself only runs for ``taxonomy_type[0]``
    (top 50 codes).  Only ads whose publication window covers "now" and
    that are not removed are counted.

    Returns a dict mapping taxonomy code -> number of ads, or ``{}``
    when any requested type has no configured aggregation path.
    """
    value_path = {
        taxonomy.OCCUPATION: "%s.%s.keyword" % (
            fields.OCCUPATION, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.GROUP: "%s.%s.keyword" % (
            fields.OCCUPATION_GROUP, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.FIELD: "%s.%s.keyword" % (
            fields.OCCUPATION_FIELD, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.SKILL: "%s.%s.keyword" % (
            fields.MUST_HAVE_SKILLS, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.MUNICIPALITY: "%s.keyword" % (
            fields.WORKPLACE_ADDRESS_MUNICIPALITY),
        taxonomy.REGION: "%s.keyword" % fields.WORKPLACE_ADDRESS_REGION
    }
    # Make sure we don't crash if we want to stat on missing type
    for tt in taxonomy_type:
        if tt not in value_path:
            # Log the offending entry itself — the original interpolated
            # the whole input sequence, hiding which type was bad.
            log.warning("Taxonomy type \"%s\" not configured for aggs." % tt)
            return {}
    aggs_query = {
        "from": 0,
        "size": 0,
        "query": {
            "bool": {
                "must": [{"match_all": {}}],
                'filter': [
                    {
                        'range': {
                            fields.PUBLICATION_DATE: {
                                'lte': 'now/m'
                            }
                        }
                    },
                    {
                        'range': {
                            fields.LAST_PUBLICATION_DATE: {
                                'gte': 'now/m'
                            }
                        }
                    },
                    {
                        'term': {
                            fields.REMOVED: False
                        }
                    },
                ]
            }
        },
        "aggs": {
            "antal_annonser": {
                "terms": {"field": value_path[taxonomy_type[0]], "size": 50},
            }
        }
    }
    log.debug('aggs_query: %s' % json.dumps(aggs_query))
    aggs_result = elastic.search(index=settings.ES_INDEX, body=aggs_query)
    code_count = {
        item['key']: item['doc_count']
        for item in aggs_result['aggregations']['antal_annonser']['buckets']}
    return code_count
def fetch_platsannons(ad_id):
    """Fetch a single job ad by document id, falling back to external id.

    A direct GET against the index is tried first (404 suppressed); if
    no document comes back, a term search on the external-id field is
    attempted.  The request is aborted with 404 when nothing matches
    and with 500 when Elasticsearch cannot be reached.
    """
    try:
        doc = elastic.get(index=settings.ES_INDEX, id=ad_id, ignore=404)
        if doc and '_source' in doc:
            return _format_ad(doc)
        # Fall back: the caller may have supplied the provider's
        # external id rather than our document id.
        fallback_query = {
            'query': {
                'term': {
                    fields.EXTERNAL_ID: ad_id
                }
            }
        }
        search_result = elastic.search(index=settings.ES_INDEX,
                                       body=fallback_query)
        matches = search_result.get('hits', {}).get('hits', [])
        if matches:
            return _format_ad(matches[0])
        log.info("Job ad %s not found, returning 404 message" % ad_id)
        abort(404, 'Ad not found')
    except exceptions.NotFoundError:
        logging.exception('Failed to find id: %s' % ad_id)
        abort(404, 'Ad not found')
        return
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return
def find_platsannonser(args, querybuilder, start_time=0, x_fields=None):
    """Run an ad search built from *args* and transform the result.

    When ``MIN_RELEVANCE`` is requested, a cheap first pass (one hit, no
    aggs/sort) finds the highest score so the real query can be given a
    proportional ``min_score`` cutoff.  Freetext queries additionally
    get extracted concepts attached to the result.  Aborts with 500 on
    Elasticsearch connection failure.
    """
    if start_time == 0:
        start_time = int(time.time() * 1000)
    query_dsl = querybuilder.parse_args(args, x_fields)
    log.debug("Query constructed after %d milliseconds."
              % (int(time.time() * 1000) - start_time))
    try:
        # First pass, find highest score:
        if args.get(settings.MIN_RELEVANCE):
            max_score_query = query_dsl.copy()
            max_score_query['from'] = 0
            max_score_query['size'] = 1
            max_score_query['track_total_hits'] = False
            # pop() instead of del: the built query may legitimately
            # lack 'aggs' or 'sort', and del would raise KeyError.
            max_score_query.pop('aggs', None)
            max_score_query.pop('sort', None)
            max_score_result = elastic.search(index=settings.ES_INDEX,
                                              body=max_score_query)
            max_score = max_score_result.get('hits', {}).get('max_score')
            if max_score:
                query_dsl['min_score'] = max_score * args.get(
                    settings.MIN_RELEVANCE)
        log.debug("ARGS %s => QUERY: %s" % (args, json.dumps(query_dsl)))
        query_result = elastic.search(index=settings.ES_INDEX, body=query_dsl)
        log.debug("Elastic results after %d milliseconds."
                  % (int(time.time() * 1000) - start_time))
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return
    if args.get(settings.FREETEXT_QUERY):
        query_result['concepts'] = \
            _extract_concept_from_concepts(
                ttc.text_to_concepts(args.get(settings.FREETEXT_QUERY))
            )
    log.debug("Elasticsearch reports: took=%d, timed_out=%s"
              % (query_result.get('took', 0),
                 query_result.get('timed_out', '')))
    return transform_platsannons_query_result(args, query_result, querybuilder)
def find_annonser(args):
    """Search auranest ads with a total-groups cardinality aggregation.

    Any statistics aggregations requested via the STATISTICS/STAT_LMT
    arguments are merged into the query before it is executed.  Returns
    the raw Elasticsearch response; aborts with 500 when the cluster
    cannot be reached.
    """
    stats_aggs = _statistics(args.pop(settings.STATISTICS),
                             args.pop(settings.STAT_LMT))
    query_dsl = _parse_args(args)
    query_dsl['aggs'] = {"total": {"cardinality": {"field": "group.id"}}}
    if stats_aggs:
        query_dsl['aggs'].update(stats_aggs)
    log.debug(json.dumps(query_dsl, indent=2))
    try:
        return elastic.search(index=settings.ES_AURANEST, body=query_dsl)
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return
def find_platsannonser(args, querybuilder, start_time=0):
    """Execute an ad search built from *args* and transform the result.

    Logs the constructed query and elapsed milliseconds at debug level.
    Aborts the request with 500 when Elasticsearch is unreachable.
    """
    if start_time == 0:
        start_time = int(time.time() * 1000)
    dsl = querybuilder.parse_args(args)
    log.debug(json.dumps(dsl, indent=2))
    log.debug("Query constructed after %d milliseconds."
              % (int(time.time() * 1000) - start_time))
    try:
        result = elastic.search(index=settings.ES_INDEX, body=dsl)
        log.debug("Elastic results after %d milliseconds."
                  % (int(time.time() * 1000) - start_time))
    except exceptions.ConnectionError as exc:
        logging.exception('Failed to connect to elasticsearch: %s' % str(exc))
        abort(500, 'Failed to establish connection to database')
        return
    took = result.get('took', 0)
    timed_out = result.get('timed_out', '')
    log.debug("Elasticsearch reports: took=%d, timed_out=%s"
              % (took, timed_out))
    return transform_platsannons_query_result(args, result, querybuilder)
def get_stats_for(taxonomy_type):
    """Count currently published ads per taxonomy code for one type.

    *taxonomy_type* is a single JobtechTaxonomy identifier; it is mapped
    to its keyword field path and aggregated with a terms aggregation
    (up to 5000 buckets) over ads whose publication window covers "now".

    Returns a dict mapping taxonomy code -> number of ads, or ``{}``
    when the type has no configured aggregation path.
    """
    log.info("Looking for %s" % taxonomy_type)
    value_path = {
        taxonomy.JobtechTaxonomy.OCCUPATION_NAME:
            "yrkesroll.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.OCCUPATION_GROUP:
            "yrkesgrupp.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.OCCUPATION_FIELD:
            "yrkesomrade.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.SKILL:
            "krav.kompetenser.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.WORKTIME_EXTENT:
            "arbetstidstyp.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.MUNICIPALITY:
            "arbetsplatsadress.taxonomi-kommun.keyword",
        taxonomy.JobtechTaxonomy.REGION:
            "arbetsplatsadress.taxonomi-lan.keyword"
    }
    # Make sure we don't crash if we want to stat on missing type
    if taxonomy_type not in value_path:
        log.warning("Taxonomy type %s not configured for aggs."
                    % taxonomy_type)
        return {}
    aggs_query = {
        "from": 0,
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }],
                'filter': [
                    {
                        'range': {
                            'publiceringsdatum': {
                                'lte': 'now/m'
                            }
                        }
                    },
                    {
                        'range': {
                            'status.sista_publiceringsdatum': {
                                'gte': 'now/m'
                            }
                        }
                    },
                ]
            }
        },
        "aggs": {
            "antal_annonser": {
                "terms": {
                    "field": value_path[taxonomy_type],
                    "size": 5000
                },
            }
        }
    }
    # The original call passed aggs_query as a %-argument with no
    # placeholder in the format string, which makes the logging module
    # raise an internal formatting error instead of printing the query.
    log.debug('aggs_query: %s', aggs_query)
    aggs_result = elastic.search(index=settings.ES_INDEX, body=aggs_query)
    code_count = {
        item['key']: item['doc_count']
        for item in aggs_result['aggregations']['antal_annonser']['buckets']
    }
    return code_count