Exemplo n.º 1
0
    def company_name_iterator(self, max_size=100**100):
        """Yield up to *max_size* unique company names from the ES index.

        First runs a cardinality aggregation to estimate how many distinct
        names exist, then pages through them using partitioned terms
        aggregations so no single response needs to hold every bucket.

        :param max_size: maximum number of names to yield (default is
                         effectively unlimited)
        """
        companyname_field = "employer.name.keyword"
        es_index = settings.ES_INDEX

        # Count distinct company names to size the partitioning.
        # NOTE(review): cardinality is approximate in ES — presumably close
        # enough for partition sizing here.
        query_unique_names = {
            "size": 0,
            "aggs": {
                "names_count": {
                    "cardinality": {
                        "field": companyname_field
                    }
                }
            }
        }
        unique_res = elastic.search(index=es_index, body=query_unique_names)
        unique_amount = int(unique_res['aggregations']['names_count']['value'])

        batch_size = 1000
        num_partitions = int(math.ceil(unique_amount / batch_size))

        aggs_query = {
            "size": 0,
            "aggs": {
                "names_agg": {
                    "terms": {
                        "field": companyname_field,
                        "include": {
                            "partition": 0,
                            "num_partitions": num_partitions
                        },
                        "size": batch_size
                    }
                }
            }
        }

        yielded = 0
        for partition in range(num_partitions):
            aggs_query['aggs']['names_agg']['terms']['include'][
                'partition'] = partition
            res = elastic.search(index=es_index, body=aggs_query)

            for bucket in res['aggregations']['names_agg']['buckets']:
                # Bug fix: the original `break` only left the inner loop, so
                # after reaching max_size the outer loop still queried every
                # remaining partition. `return` stops all work immediately.
                if yielded >= max_size:
                    return
                yielded += 1
                yield bucket['key']
Exemplo n.º 2
0
def autocomplete(querystring):
    """Return up to 20 keyword completions for the last word of *querystring*.

    Everything before the last word is used as the freetext query context;
    the last word itself becomes a prefix filter on the ``keywords.raw``
    terms aggregation.

    :param querystring: raw user input (may be None or empty)
    :return: list of matching keyword strings (possibly empty)
    """
    if not querystring:
        querystring = ''
    words = querystring.split(' ')
    without_last = ' '.join(words[:-1])
    query_dsl = _parse_args({
        settings.FREETEXT_QUERY: without_last,
        settings.LIMIT: 0,
        settings.SHOW_EXPIRED: 'false'
    })
    complete = words[-1]
    query_dsl['aggs'] = {
        'complete': {
            "terms": {
                "field": "keywords.raw",
                "size": 20,
                # NOTE(review): `complete` is interpolated into the include
                # regex unescaped — regex metacharacters in user input will
                # change the pattern. Consider escaping if that matters.
                "include": "%s.*" % complete
            }
        }
    }
    query_result = elastic.search(index=settings.ES_AURANEST, body=query_dsl)
    # The .get(..., {}) chain already yields [] when 'aggregations' is
    # missing, so the original membership pre-check was redundant.
    return [
        bucket['key']
        for bucket in query_result.get('aggregations', {})
                                  .get('complete', {})
                                  .get('buckets', [])
    ]
Exemplo n.º 3
0
def get_stats_for(taxonomy_type):
    """Return {code: ad_count} for the given taxonomy types.

    Aggregates currently-published, non-removed ads on the keyword field
    matching the first entry of *taxonomy_type*. All entries are validated
    so an unknown type returns {} instead of crashing.

    :param taxonomy_type: sequence of taxonomy type identifiers; only the
                          first entry is used for the aggregation field
    :return: dict mapping taxonomy code -> document count
    """
    value_path = {
        taxonomy.OCCUPATION: "%s.%s.keyword" %
        (fields.OCCUPATION, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.GROUP: "%s.%s.keyword" % (
            fields.OCCUPATION_GROUP, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.FIELD: "%s.%s.keyword" % (
            fields.OCCUPATION_FIELD, fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.SKILL: "%s.%s.keyword" % (fields.MUST_HAVE_SKILLS,
                                           fields.LEGACY_AMS_TAXONOMY_ID),
        taxonomy.MUNICIPALITY: "%s.keyword" % fields.WORKPLACE_ADDRESS_MUNICIPALITY,
        taxonomy.REGION: "%s.keyword" % fields.WORKPLACE_ADDRESS_REGION
    }
    # Make sure we don't crash if we want to stat on missing type
    for tt in taxonomy_type:
        if tt not in value_path:
            # Bug fix: the original logged the whole taxonomy_type sequence
            # instead of the specific unconfigured entry.
            log.warning("Taxonomy type \"%s\" not configured for aggs." % tt)
            return {}

    # Only ads that are published now, not yet expired, and not removed.
    aggs_query = {
        "from": 0, "size": 0,
        "query": {
            "bool": {
                "must": [{"match_all": {}}],
                'filter': [
                    {
                        'range': {
                            fields.PUBLICATION_DATE: {
                                'lte': 'now/m'
                            }
                        }
                    },
                    {
                        'range': {
                            fields.LAST_PUBLICATION_DATE: {
                                'gte': 'now/m'
                            }
                        }
                    },
                    {
                        'term': {
                            fields.REMOVED: False
                        }
                    },
                ]
            }
        },
        "aggs": {
            "antal_annonser": {
                "terms": {"field": value_path[taxonomy_type[0]], "size": 50},
            }
        }
    }
    log.debug('aggs_query: %s' % json.dumps(aggs_query))
    aggs_result = elastic.search(index=settings.ES_INDEX, body=aggs_query)

    code_count = {
        item['key']: item['doc_count']
        for item in aggs_result['aggregations']['antal_annonser']['buckets']}
    return code_count
Exemplo n.º 4
0
def fetch_platsannons(ad_id):
    """Fetch a single job ad by document id, falling back to external-id lookup.

    Aborts with 404 when no ad matches and with 500 when elasticsearch is
    unreachable.

    :param ad_id: document id (or external id) of the ad
    :return: formatted ad dict, or never returns on abort
    """
    try:
        result = elastic.get(index=settings.ES_INDEX, id=ad_id, ignore=404)
        if result and '_source' in result:
            return _format_ad(result)

        # Direct id lookup missed; try matching the external id field instead.
        fallback_query = {
            'query': {
                'term': {
                    fields.EXTERNAL_ID: ad_id
                }
            }
        }
        fallback_result = elastic.search(index=settings.ES_INDEX,
                                         body=fallback_query)
        matches = fallback_result.get('hits', {}).get('hits', [])
        if matches:
            return _format_ad(matches[0])

        log.info("Job ad %s not found, returning 404 message" % ad_id)
        abort(404, 'Ad not found')
    except exceptions.NotFoundError:
        logging.exception('Failed to find id: %s' % ad_id)
        abort(404, 'Ad not found')
        return
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return
def find_platsannonser(args, querybuilder, start_time=0, x_fields=None):
    """Run a job-ad search and return the transformed result.

    When MIN_RELEVANCE is requested, a lightweight first pass finds the
    top score so a proportional ``min_score`` cutoff can be applied to the
    real query.

    :param args: parsed request arguments
    :param querybuilder: object that turns args into a query DSL dict
    :param start_time: epoch millis used for timing logs (0 = now)
    :param x_fields: optional field mask forwarded to the querybuilder
    """
    if start_time == 0:
        start_time = int(time.time() * 1000)
    query_dsl = querybuilder.parse_args(args, x_fields)
    log.debug("Query constructed after %d milliseconds." %
              (int(time.time() * 1000) - start_time))
    try:
        # First pass, find highest score:
        if args.get(settings.MIN_RELEVANCE):
            max_score_query = query_dsl.copy()
            max_score_query['from'] = 0
            max_score_query['size'] = 1
            max_score_query['track_total_hits'] = False
            # Bug fix: `del` raised KeyError when the querybuilder produced
            # no 'aggs' or 'sort' key; pop with a default is safe either way.
            max_score_query.pop('aggs', None)
            max_score_query.pop('sort', None)
            max_score_result = elastic.search(index=settings.ES_INDEX,
                                              body=max_score_query)
            max_score = max_score_result.get('hits', {}).get('max_score')
            if max_score:
                # Scale the cutoff relative to the best-matching document.
                query_dsl['min_score'] = max_score * args.get(
                    settings.MIN_RELEVANCE)
        log.debug("ARGS %s => QUERY: %s" % (args, json.dumps(query_dsl)))
        query_result = elastic.search(index=settings.ES_INDEX, body=query_dsl)
        log.debug("Elastic results after %d milliseconds." %
                  (int(time.time() * 1000) - start_time))
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return

    if args.get(settings.FREETEXT_QUERY):
        # Enrich the response with the concepts extracted from the freetext.
        query_result['concepts'] = \
            _extract_concept_from_concepts(
                ttc.text_to_concepts(args.get(settings.FREETEXT_QUERY))
            )

    log.debug("Elasticsearch reports: took=%d, timed_out=%s" %
              (query_result.get('took', 0), query_result.get('timed_out', '')))
    return transform_platsannons_query_result(args, query_result, querybuilder)
Exemplo n.º 6
0
def find_annonser(args):
    """Search auranest ads, attaching a group-cardinality total and any
    requested statistics aggregations to the query.

    :param args: request arguments; STATISTICS and STAT_LMT are consumed here
    :return: raw elasticsearch response dict
    """
    stats_aggs = _statistics(args.pop(settings.STATISTICS),
                             args.pop(settings.STAT_LMT))
    query_dsl = _parse_args(args)

    # Always count distinct groups; merge in optional statistics aggs.
    aggs = {"total": {"cardinality": {"field": "group.id"}}}
    if stats_aggs:
        aggs.update(stats_aggs)
    query_dsl['aggs'] = aggs

    log.debug(json.dumps(query_dsl, indent=2))
    try:
        return elastic.search(index=settings.ES_AURANEST, body=query_dsl)
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return
Exemplo n.º 7
0
def find_platsannonser(args, querybuilder, start_time=0):
    """Run a job-ad search and return the transformed result.

    :param args: parsed request arguments
    :param querybuilder: object that turns args into a query DSL dict
    :param start_time: epoch millis used for timing logs (0 = now)
    """
    if not start_time:
        start_time = int(time.time() * 1000)

    query_dsl = querybuilder.parse_args(args)
    log.debug(json.dumps(query_dsl, indent=2))
    elapsed = int(time.time() * 1000) - start_time
    log.debug("Query constructed after %d milliseconds." % elapsed)

    try:
        query_result = elastic.search(index=settings.ES_INDEX, body=query_dsl)
        elapsed = int(time.time() * 1000) - start_time
        log.debug("Elastic results after %d milliseconds." % elapsed)
    except exceptions.ConnectionError as e:
        logging.exception('Failed to connect to elasticsearch: %s' % str(e))
        abort(500, 'Failed to establish connection to database')
        return

    log.debug("Elasticsearch reports: took=%d, timed_out=%s" %
              (query_result.get('took', 0), query_result.get('timed_out', '')))
    return transform_platsannons_query_result(args, query_result, querybuilder)
Exemplo n.º 8
0
def get_stats_for(taxonomy_type):
    """Return {taxonomy code: ad_count} for a single taxonomy type.

    Aggregates currently-published ads on the keyword field mapped to
    *taxonomy_type*; unknown types return {} instead of crashing.

    :param taxonomy_type: one of the JobtechTaxonomy type identifiers
    :return: dict mapping taxonomy code -> document count
    """
    log.info("Looking for %s" % taxonomy_type)
    value_path = {
        taxonomy.JobtechTaxonomy.OCCUPATION_NAME:
        "yrkesroll.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.OCCUPATION_GROUP:
        "yrkesgrupp.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.OCCUPATION_FIELD:
        "yrkesomrade.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.SKILL:
        "krav.kompetenser.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.WORKTIME_EXTENT:
        "arbetstidstyp.taxonomi-kod.keyword",
        taxonomy.JobtechTaxonomy.MUNICIPALITY:
        "arbetsplatsadress.taxonomi-kommun.keyword",
        taxonomy.JobtechTaxonomy.REGION:
        "arbetsplatsadress.taxonomi-lan.keyword"
    }
    # Make sure we don't crash if we want to stat on missing type
    if taxonomy_type not in value_path:
        log.warning("Taxonomy type %s not configured for aggs." %
                    taxonomy_type)
        return {}

    # Only ads whose publication window includes the current minute.
    aggs_query = {
        "from": 0,
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }],
                'filter': [
                    {
                        'range': {
                            'publiceringsdatum': {
                                'lte': 'now/m'
                            }
                        }
                    },
                    {
                        'range': {
                            'status.sista_publiceringsdatum': {
                                'gte': 'now/m'
                            }
                        }
                    },
                ]
            }
        },
        "aggs": {
            "antal_annonser": {
                "terms": {
                    "field": value_path[taxonomy_type],
                    "size": 5000
                },
            }
        }
    }
    # Bug fix: the original `log.debug('aggs_query', aggs_query)` passed an
    # extra %-argument with no placeholder, so logging raised a formatting
    # error internally and the query was never rendered.
    log.debug('aggs_query: %s' % aggs_query)
    aggs_result = elastic.search(index=settings.ES_INDEX, body=aggs_query)
    code_count = {
        item['key']: item['doc_count']
        for item in aggs_result['aggregations']['antal_annonser']['buckets']
    }
    return code_count