Example #1
    def execute(self):
        """
        Index data of specified queryset
        """
        start_time = time.time()

        for qs, progress in self.batch_qs():

            elapsed = time.time() - start_time

            total_left = (1 / (progress + 0.001)) * elapsed - elapsed

            progress_msg = \
                'PART: %s %.3f : duration: %.2f left: %.2f' % (
                    self.part, progress, elapsed, total_left
                )

            log.info(progress_msg)

            helpers.bulk(
                self.client,
                (self.convert(obj).to_dict(include_meta=True) for obj in qs),
                raise_on_error=True,
            )

        if settings.TESTING and self.index:
            idx = es.Index(self.index)
            # refresh the index to make sure it's ready for queries
            idx.refresh()
Example #2
 def __connect_to_elastic(self):
     # Creates a connection to elastic
     connections.create_connection(
         hosts=settings.ELASTIC_SEARCH_HOSTS,
         retry_on_timeout=True,
     )
     return es.Index(self.index)
Example #3
    def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
        es_doc = es_doc_cls()

        elasticsearch_conn = connections.get_connection()

        sync_timestamp = current_server_timestamp()

        pending_insertions = self._compute_dirty_documents(
            sql_table_cls, es_doc.doc_type)

        bulk_op = self._synchronisation_op(es_doc, pending_insertions)

        self._logging(logging.INFO, 'Performing synchronization.')

        for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
            obj_id = info['index']['_id'] \
                if 'index' in info else info['update']['_id']

            if ok:
                # Mark the task as handled so we don't process it again next time
                self._logging(
                    logging.INFO,
                    'Document %s has been synced successfully.' % obj_id)

                sql_table_cls.update_last_sync(obj_id, sync_timestamp)
            else:
                id_logger(obj_id, logging.ERROR,
                          'Error while syncing document %s index.' % obj_id)

        # Refresh indices so the newly synced documents are searchable
        elasticsearch_dsl.Index(es_doc.index).refresh()
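The _synchronisation_op helper called above is not shown in this example. A minimal sketch of what it could yield, assuming update actions with doc_as_upsert (the response-handling loop above accepts either an 'index' or an 'update' key) and a hypothetical to_document_dict() serialisation helper on the SQL rows:

    def _synchronisation_op(self, es_doc, pending_insertions):
        # Hypothetical sketch, not the original implementation: one bulk action
        # per dirty row. With '_op_type': 'update' the bulk response item is
        # keyed by 'update', which the loop above handles.
        for row in pending_insertions:
            yield {
                '_op_type': 'update',
                '_index': es_doc.index,
                '_type': es_doc.doc_type,        # assumed: pre-7.x mapping types
                '_id': row.id,                   # assumed primary-key attribute
                'doc': row.to_document_dict(),   # hypothetical serialisation helper
                'doc_as_upsert': True,
            }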
Example #4
    def build_custom_dict(self):
        df = pd.read_excel(os.path.join(MEDIA_ROOT, 'dict.xlsx'))
        number_of_words = len(df)

        index = es.Index(ES_INDEX_CUSTOM_DICTIONARY_WORD, using=ES_CLIENT)
        index.delete(ignore=404)
        print("Creating index")
        CustomDictionaryWord.init()

        failed, success = 0, 0
        batch_size = 1000
        for ok, result in parallel_bulk(ES_CLIENT,
                                        self.word_generator(df),
                                        index=ES_INDEX_CUSTOM_DICTIONARY_WORD,
                                        chunk_size=batch_size,
                                        raise_on_error=False,
                                        thread_count=6):
            if ok:
                success += 1
            else:
                failed += 1
                action, result = result.popitem()
                print("!!!", action, result)

            if failed > 3:
                raise Exception("Too many failed!!")
            if (success + failed) % batch_size == 0:
                print(f'{success+failed}/{number_of_words} processed')
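The self.word_generator(df) helper used above is not shown. Because index= and chunk_size= are passed to parallel_bulk, each yielded action only needs its bulk metadata plus the document body. A hedged sketch with an assumed column name:

    def word_generator(self, df):
        # Hypothetical sketch, not the original: one action per Excel row.
        # '_id' is bulk metadata; the remaining keys become the document source.
        for row_number, row in df.iterrows():
            yield {
                '_id': row_number,
                'word': row['word'],  # assumed column name in dict.xlsx
            }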
Example #5
    def execute(self):

        idx = es.Index(self.index)

        for dt in self.doc_types:
            idx.document(dt)
        idx.create()
        idx.refresh()
Example #6
def create_new_connection(address, index='training_jobs'):
    """
    Creates a new connection to elasticsearch.
    """
    elasticsearch_dsl.connections.create_connection(
        hosts=[address]
    )
    return elasticsearch_dsl.Index(index)
Example #7
    def test_index(self):
        """Initialize test index"""
        DataDocType.init()
        index = es.Index("test")

        try:
            index.delete()
        except elasticexceptions.NotFoundError:
            assert False
Example #8
def create_index(index_name):
    """ Creates a new index, destroying any existing index with the same
        name. """
    index = edsl.Index(index_name)
    index.settings(number_of_shards=1)
    if index.exists():
        index.delete()
        print('old index deleted')
    index.create()
    return index
Example #9
    def execute(self):

        idx = es.Index(self.index)

        try:
            idx.delete(ignore=404)
            log.info("Deleted index %s", self.index)
        except AttributeError:
            log.warning("Could not delete index '%s', ignoring", self.index)
        except NotFoundError:
            log.warning("Index '%s' not found, ignoring", self.index)
Example #10
def create_indices(scanner):
    for regex in scanner.regexes:
        id_ = regex.id.lower()

        index_name = f'{INDEX_PREFIX}-{id_}'.lower()
        index = es_dsl.Index(index_name)
        if index.exists():
            index.delete()
        index.create()

        mapping = es_dsl.Mapping()
        add_field_mappings(id_, regex, mapping)
        mapping.save(index_name)
Example #11
    def execute(self):

        idx = es.Index(self.index)

        try:
            idx.delete(ignore=404)
            log.info("Deleted index %s", self.index)
        except AttributeError:
            log.warning("Could not delete index '%s', ignoring", self.index)

        for dt in self.doc_types:
            idx.document(dt)

        idx.create()
Example #12
 def handle(self, *args, **options):
     index_name = es_index_name
     elasticsearch_dsl.connections.connections.create_connection(hosts=["127.0.0.1"])
     if elasticsearch_dsl.Index(name=index_name).exists():
         elasticsearch_dsl.Index(name=index_name).delete()
     for process in process_all:
         print("Now process %r" % process.pk)
         host = Hosts.objects.get(server_uuid=process.server_uuid)
         esp = es_docs.EsProcess(meta={'id': process.pk},
                                 id=process.pk,
                                 p_name=process.p_name,
                                 p_status=process.p_status,
                                 p_cwd=process.p_cwd,
                                 p_exe=process.p_exe,
                                 p_username=process.p_username,
                                 p_create_time=process.p_create_time,
                                 p_cmdline=process.p_cmdline,
                                 listen_ip_port=process.listen_ip_port,
                                 # server_uuid=host.server_uuid,
                                 server=host.ip_addresses,
                                 old_mark=process.old_mark
                                 )
         esp.save()
Example #13
def es_client():
    client = _es_client()
    yield client
    # Push all changes to segments to make sure all annotations that were added get removed.
    elasticsearch_dsl.Index(client.index, using=client.conn).refresh()
    client.conn.delete_by_query(
        index=client.index,
        body={"query": {"match_all": {}}},
        # This query occasionally fails with a version conflict.
        # Forcing the deletion resolves the issue, but the exact
        # cause of the version conflict has not been found yet.
        conflicts="proceed",
        # Add refresh to make deletion changes show up in search results.
        refresh=True,
    )
Example #14
    def clear_index(cls, index=None):
        """
        Clears the index.
        """
        if not index:
            index = cls.get_index_config()
        if not index:
            raise Exception('Index not found!')

        connections.create_connection(hosts=index['connection']['hosts'])
        try:
            index_instance = es.Index(index['index_name'])
            index_instance.delete()
        except Exception:
            pass
        connections.remove_connection(index['connection_name'])
Example #15
 def handle(self, *args, **options):
     self.batch_size = options['batch_size']
     self.from_id = 0
     if "from_id" in options:
         self.from_id = options['from_id']
     self.to_id = None
     if "to_id" in options:
         self.to_id = options['to_id']
     self.client = ES_CLIENT
     if not self.from_id:
         print("Deleting index")
         index = es.Index(ES_INDEX_DOCUMENT, using=self.client)
         index.delete(ignore=404)
         print("Creating index")
         ESDocument.init()
     self.send_elastic()
Example #16
def _get_elasticsearch_index_samples(elasticsearch_index):
    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=get_es_client())
    try:
        field_mapping = index.get_field_mapping(fields=['*{}'.format(sample_field_suffix)], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([key.split(sample_field_suffix)[0] for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples
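For reference, the parsing above assumes a pre-7.x get_field_mapping response shaped roughly as follows (illustrative values only, not from the original project; 'variant' stands in for VARIANT_DOC_TYPE): one entry per concrete index, with per-field entries keyed by field name under the doc type, so splitting each key on '_num_alt' recovers the sample id.

    # Illustrative shape (assumed):
    # {'my_index_v1': {'mappings': {'variant': {
    #     'SAMPLE-1_num_alt': {'full_name': 'SAMPLE-1_num_alt', 'mapping': {...}},
    #     'SAMPLE-2_num_alt': {'full_name': 'SAMPLE-2_num_alt', 'mapping': {...}},
    # }}}}
    # With this input the function would return {'SAMPLE-1', 'SAMPLE-2'}.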
Example #17
    def execute(self):
        index = current_app.config['ELASTIC_INDICES'][self.index_key]
        idx = es.Index(index)

        try:
            idx.delete(ignore=404)
        except NotFoundError:
            log.warning("Could not delete index '%s', ignoring", index)
        else:
            log.info("Deleted index %s", index)

        # create doc types
        for dt in self.doc_types:
            idx.doc_type(dt)

        # create index
        idx.create()
Example #18
    def _perform_index_purge(self, index_name, index_settings, doc_type_class):
        log_msg = 'Dropping %s index.' % index_name
        self._logging(logging.INFO, log_msg)

        index = elasticsearch_dsl.Index(index_name)
        index.settings(**index_settings)
        index.doc_type(doc_type_class)

        try:
            index.delete(ignore=404)
            index.create()
        except elasticsearch.exceptions.ElasticsearchException as e:
            log_msg = 'Error while dropping %s index: %s.' % (index_name, e)
            self._logging(logging.ERROR, log_msg)
            return

        log_msg = 'Index %s has been dropped successfully.' % index_name
        self._logging(logging.INFO, log_msg)
Example #19
    def create_index(self, index_name, run_logs):
        """ Creates a new index, destroying any existing index with the same
            name. """
        index = edsl.Index(index_name)

        index.settings(number_of_shards=3)
        if index.exists():
            index.delete()
            run_logs.insert_log('old {0} index deleted'.format(index_name))

        if index_name == "enc_dates_NOT_NOW":
            run_logs.insert_log("entered analyzer for {0}".format(index_name))
            enc_analyzer = analyzer('enc_analyzer',
                                    tokenizer="whitespace",
                                    filter=['lowercase'])
            index.analyzer(enc_analyzer)

        index.create()
        run_logs.insert_log('new {0} index created'.format(index_name))
        return index
Example #20
def es_client():
    client = _es_client()
    yield client
    # Push all changes to segments to make sure all annotations that were added get removed.
    elasticsearch_dsl.Index(client.index, using=client.conn).refresh()

    # Pylint can't understand the ES library
    # pylint: disable=unexpected-keyword-arg
    client.conn.delete_by_query(
        index=client.index,
        body={"query": {"match_all": {}}},
        # This query occasionally fails with a version conflict.
        # Forcing the deletion resolves the issue, but the exact
        # cause of the version conflict has not been found yet.
        conflicts="proceed",
        # Add refresh to make deletion changes show up in search results.
        refresh=True,
    )

    # Close connection to ES server to avoid ResourceWarning about a leaked TCP socket.
    client.close()
Example #21
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    sample_field_suffix = '_num_alt'

    es_client = get_es_client(timeout=30)
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index),
                                    using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix), 'join_field'],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    #  Nested genotypes
    if field_mapping.get(elasticsearch_index,
                         {}).get('mappings', {}).get(VARIANT_DOC_TYPE,
                                                     {}).get('join_field'):
        max_samples = Individual.objects.filter(
            family__project=project).count()
        s = elasticsearch_dsl.Search(using=es_client,
                                     index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0] for key in index.get(
                'mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
Example #22
    def _perform_geocomplete_index_population(self, max_doc):
        elasticsearch_conn = connections.get_connection()

        to_index = list()

        for i, document in enumerate(self._geocompletion_documents()):
            if i % max_doc == 0:
                log_msg = 'Computing required geoloc-entry documents.'
                self._logging(logging.INFO, log_msg)

            to_index.append(document.to_dict(True))

            if len(to_index) < max_doc:
                continue

            self._geocomplete_index_batch(elasticsearch_conn, to_index)

            to_index = list()

        if len(to_index) != 0:
            self._geocomplete_index_batch(elasticsearch_conn, to_index)

        elasticsearch_dsl.Index('geocomplete').refresh()
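The _geocomplete_index_batch helper is not shown here. Since each document is serialised with to_dict(True) (i.e. including its bulk metadata), a minimal sketch under that assumption is a plain bulk call, mirroring example #1:

    def _geocomplete_index_batch(self, elasticsearch_conn, to_index):
        # Hypothetical sketch, not the original; assumes
        # `from elasticsearch import helpers` at module level.
        helpers.bulk(elasticsearch_conn, to_index, raise_on_error=True)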
Example #23
def _get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    #  Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        s = elasticsearch_dsl.Search(using=es_client,
                                     index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='samples_num_alt_1',
                                size=10000))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    sample_field_suffix = '_num_alt'
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index),
                                    using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix)],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0] for key in index.get(
                'mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
Example #24
import ast
import logging
import pathlib
import re
import typing
from collections import namedtuple

import elasticsearch_dsl
import elasticsearch_dsl.connections
import pandas as pd
from pandas.errors import EmptyDataError

TRAINING_JOBS = 'training_jobs'
VALIDATION_JOBS = 'validation_jobs'
JOB_INDEX = elasticsearch_dsl.Index(TRAINING_JOBS)
VALIDATION_JOB_INDEX = elasticsearch_dsl.Index(VALIDATION_JOBS)

Metrics = namedtuple('Metrics', [
    'epochs', 'train_acc', 'final_val_acc', 'best_val_acc', 'final_val_loss',
    'best_val_loss', 'final_val_sensitivity', 'best_val_sensitivity',
    'final_val_specificity', 'best_val_specificity'
])


class TrainingJob(elasticsearch_dsl.Document):
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
Example #25
import elasticsearch_dsl as es
from django.conf import settings
from elasticsearch_dsl import analyzer, tokenizer

dutch_analyzer = es.analyzer('dutchanalyzer',
                             type='standard',
                             stopwords='_dutch_')

base_analyzer = analyzer('zorg_base_txt',
                         tokenizer=tokenizer('trigram',
                                             'nGram',
                                             min_gram=2,
                                             max_gram=20),
                         filter=['lowercase'])

_index = es.Index(settings.ELASTIC_INDEX)


@_index.doc_type
class Term(es.DocType):
    term = es.Text()
    gewicht = es.Integer()


@_index.doc_type
class Organisatie(es.DocType):
    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)  # ngram
    beschrijving = es.String(analyzer=dutch_analyzer)
    afdeling = es.String(index='not_analyzed')
Example #26
import datetime
from datetime import timedelta
import logging

from six import iteritems, itervalues
import elasticsearch_dsl as esd


from .. import app


PhotoIndex = esd.Index(app.config["ELASTICSEARCH_INDEX"])
# This allows deeper pagination in /photos. Going past 10000/20 pages probably
# doesn't make a whole lot of sense and should be solved differently, but until then...
PhotoIndex.settings(max_result_window=500000)

class ExtendedDateHistogramFacet(esd.DateHistogramFacet):
    # Temporary until the elasticsearch-dsl library includes the 'year' range
    DATE_INTERVALS = {
        'year': lambda d: (d+timedelta(days=366)).replace(day=1),
        'month': lambda d: (d+timedelta(days=32)).replace(day=1),
        'week': lambda d: d+timedelta(days=7),
        'day': lambda d: d+timedelta(days=1),
        'hour': lambda d: d+timedelta(hours=1),
    }

@PhotoIndex.doc_type
class PhotoDocument(esd.DocType):
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
Example #27
from __future__ import (
    division,
    print_function,
    unicode_literals,
)


import elasticsearch
import elasticsearch_dsl as dsl
from flask import current_app as app

from ..versioning import ArchivingDocType
from .exceptions import ConflictError, NotFoundError
from .aliases import get_alias, unalias


auth_index = dsl.Index('auth')
auth_index.settings(
    number_of_shards=2,
    number_of_replicas=1
)

class Customer(ArchivingDocType):
    """Model a customer."""

    name = dsl.Keyword()
    permissions = dsl.Object()
    cycles = dsl.Object()

    class Meta:
        index = auth_index._name
Example #28
    doc_id = es_dsl.Integer()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create or reupload index')
    # parser.add_argument('--input', type=str,
    #                     help='Json file with preprocessed data')
    parser.add_argument('-i', '--index', type=str, help='Index name to create')
    args = parser.parse_args()

    indexName = args.index  # "query_tips_index_v0.3"
    # path = args.input  # "../data/data/data_clear_es_pm.json"
    properties = ["request", "request_lemms", "popularity"]  # without id

    esClient = Elasticsearch()
    index = es_dsl.Index(indexName, using=esClient)

    index.delete(ignore=404)
    index.settings(number_of_shards=1, number_of_replicas=0, analysis=analysis)
    index.doc_type(queryTipsIndex_doctype)  # <-- CHANGE NAME
    index.create()
    queryTipsIndex_doctype.init(using=esClient)

    with open('./vidal_total_dict.json', 'r', encoding='utf-8') as f_v:
        vidal_data = json.load(f_v)

    i = 0
    # The JSON has the form: [1:{original:.. norm:.. llt_id:.. pt_id:..} 2:...]
    for elem in vidal_data.items():
        # print(normalize_text(elem[0]))
        # if i % 800 == 0: