Example #1
def create_index(es, name, doc_class, synonyms=None, excluding_terms=None):
    """Crea un índice Elasticsearch utilizando un nombre y una clase de
    documento.

    Args:
        es (elasticsearch.Elasticsearch): Cliente Elasticsearch.
        name (str): Nombre del índice a crear.
        doc_class (type): Clase del documento (debe heredar de Document).
        synonyms (list): Lista de sinónimos a utilizar en caso de necesitar el
            analizador 'name_analyzer_synonyms'.
        excluding_terms (list): Lista de términos excluyentes a utilizar en
            caso de necesitar el analizador 'name_analyzer_excluding_terms'.

    """
    index = Index(name)

    # Create the 'name_analyzer_synonyms' analyzer only if it was requested
    # explicitly. If the 'doc_class' document type uses the analyzer anywhere
    # in its mapping, the 'synonyms' list should be present.
    if synonyms is not None:
        index.analyzer(gen_name_analyzer_synonyms(synonyms))

    # Same reasoning as for 'name_analyzer_synonyms'.
    if excluding_terms is not None:
        index.analyzer(gen_name_analyzer_excluding_terms(excluding_terms))

    index.document(doc_class)
    index.create(using=es)
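A minimal usage sketch of the helper above; the Document subclass, index name and synonym list are assumptions for illustration:

# Hypothetical usage (StreetDoc, the index name and the synonyms are assumptions)
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Document, Text

class StreetDoc(Document):
    # field mapped with the synonyms analyzer registered by create_index()
    name = Text(analyzer='name_analyzer_synonyms')

es = Elasticsearch()
create_index(es, 'streets', StreetDoc, synonyms=['av, avenida', 'gral, general'])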
Example #2
def createIndex():
    connections.create_connection(hosts=[host], timeout=20)
    if not connections.get_connection().indices.exists(index):
        odrednicaIdx = Index(index)
        odrednicaIdx.analyzer(serbianAnalyzer)
        odrednicaIdx.document(Odrednica)
        odrednicaIdx.create()
Example #3
def insert_document_to_index(documents, text_an, index, keep):
    client = Elasticsearch()

    idx = Index(index, using=client)

    if idx.exists() and not keep:
        print('Removing existing index...')
        idx.delete()

    if not idx.exists():
        print('Creating index')
        idx.create()

    idx.close()
    idx.analyzer(text_an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print("Index settings=", idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
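A hedged sketch of calling the helper above; the analyzer and the shape of the documents iterable are assumptions consistent with the 'document' mapping it installs:

from elasticsearch_dsl import analyzer

# assumptions: a simple custom analyzer and bulk actions targeting the same index
text_an = analyzer('my_text_analyzer', tokenizer='standard',
                   filter=['lowercase', 'asciifolding'])
documents = [
    {'_index': 'papers', '_type': 'document',
     'path': '/data/papers/0001.txt', 'text': 'body of the first document'},
]
insert_document_to_index(documents, text_an, index='papers', keep=False)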
Example #4
def setup_index(year):
    index = Index(f'{INDEX_NAME}-{year}')
    index.settings(number_of_shards=2, number_of_replicas=0)
    index.aliases(politicians={})
    index.document(Politicians)
    index.analyzer(brazilian_analyzer)
    index.create()
Example #5
def test_analyzers_returned_from_to_dict():
    random_analyzer_name = ''.join((choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(random_analyzer_name, tokenizer="standard", filter="standard")
    index = Index('i', using='alias')
    index.analyzer(random_analyzer)

    assert index.to_dict()["settings"]["analysis"]["analyzer"][random_analyzer_name] == {"filter": ["standard"], "type": "custom", "tokenizer": "standard"}
Example #7
    def construct_index(cls, opts, bases):
        i = None
        if opts is None:
            # Inherit Index from base classes
            for b in bases:
                if getattr(b, "_index", DEFAULT_INDEX) is not DEFAULT_INDEX:
                    parent_index = b._index
                    i = Index(
                        parent_index._name,
                        doc_type=parent_index._mapping.doc_type,
                        using=parent_index._using,
                    )
                    i._settings = parent_index._settings.copy()
                    i._aliases = parent_index._aliases.copy()
                    i._analysis = parent_index._analysis.copy()
                    i._doc_types = parent_index._doc_types[:]
                    break
        if i is None:
            i = Index(
                getattr(opts, "name", "*"),
                doc_type=getattr(opts, "doc_type", "doc"),
                using=getattr(opts, "using", "default"),
            )
        i.settings(**getattr(opts, "settings", {}))
        i.aliases(**getattr(opts, "aliases", {}))
        for a in getattr(opts, "analyzers", ()):
            i.analyzer(a)
        return i
Example #8
def insert_documents_to_index(documents, an, index):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists():
        idx.delete()

    idx.settings(number_of_shards=1)
    idx.create()

    idx = Index(index, using=client)
    idx.close()
    idx.analyzer(an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print('Index settings=', idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
Example #9
def add_company_search_index_and_populate(apps, schema_editor):
    companies = Index('companies')
    if not companies.exists():
        companies.doc_type(CompanyDocType)
        companies.analyzer(analyzer('english'))
        companies.create()
    management.call_command('populate_elasticsearch')
Example #10
    def handle(self, *args, **options):
        companies = Index('companies')
        companies.delete(ignore=404)

        companies.doc_type(CompanyDocType)
        companies.analyzer(analyzer('english'))
        companies.create()
        management.call_command('populate_elasticsearch')
Example #11
    def create_index(self, name, doc_type, alias):
        index = Index(name)
        index.document(doc_type)
        index.analyzer(analyzer('english'))
        # give the index an alias (e.g., `company_alias`), so the index is used
        # when the application searches from or inserts into `company_alias`.
        index.aliases(**{alias: {}})  # same as .aliases(company_alias={})
        index.create()
        doc_type._index = index
        return index
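A hedged usage sketch of the method above; CompanyDocType and the index/alias names are illustrative:

# hypothetical call from the surrounding class; CompanyDocType is an assumed Document subclass
self.create_index('companies-2020-01-01', CompanyDocType, alias='company_alias')

Pointing the application at the alias rather than the dated index name lets a later rebuild swap in a new index without code changes.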
Example #12
def test_conflicting_analyzer_raises_error():
    i = Index('i')
    i.analyzer('my_analyzer',
               tokenizer='whitespace',
               filter=['lowercase', 'stop'])

    with raises(ValueError):
        i.analyzer('my_analyzer',
                   tokenizer='keyword',
                   filter=['lowercase', 'stop'])
Example #13
    def construct_index(cls, opts, bases):
        i = Index(
            getattr(opts, "name", "*"),
            doc_type=getattr(opts, "doc_type", "doc"),
            using=getattr(opts, "using", "default"),
        )
        i.settings(**getattr(opts, "settings", {}))
        i.aliases(**getattr(opts, "aliases", {}))
        for a in getattr(opts, "analyzers", ()):
            i.analyzer(a)
        return i
Example #14
File: indexer.py  Project: unsftn/rsj
def create_index_if_needed():
    try:
        for es_idx in ALL_INDEXES:
            if not connections.get_connection().indices.exists(
                    es_idx['index']):
                idx = Index(es_idx['index'])
                idx.analyzer(SERBIAN_ANALYZER)
                idx.document(es_idx['document'])
                idx.create()
    except Exception as ex:
        log.fatal(ex)
Example #15
    def create_index(self, name, document, alias):
        index = Index(name)
        index.document(document)
        index.analyzer(analyzer('english'))
        # give the index an alias (e.g., `company_alias`), so the index is used
        # when the application searches from or inserts into `company_alias`.
        index.aliases(**{alias: {}})  # same as .aliases(company_alias={})
        index.create()
        document._index = index
        self.stdout.write(self.style.SUCCESS('New index created'))
        return index
Example #16
    def init_index(cls):
        '''
        Class method to init index
        '''

        # default analyzer
        shingle_filter = token_filter(
            'shingle_filter',
            type='shingle',
            min_shingle_size=2,
            max_shingle_size=3,
        )
        default_analyzer = analyzer(
            'default',
            tokenizer='standard',
            char_filter=['html_strip'],
            filter=['lowercase', 'asciifolding', shingle_filter]
        )

        # set the analyzers for the available languages
        # TODO: languages and languages_stopwords should be in settings
        languages = ('es', 'en')
        languages_stopwords = {
            'en': '_english_',
            'es': '_spanish_',
        }
        languages_analyzers = {}
        languages_filters = {}
        for language in languages:
            languages_filters[language] = token_filter(
                language + '_filter',
                type='stop',
                stopwords=languages_stopwords[language],
            )
            languages_analyzers[language] = analyzer(
                language + '_analyzer',
                tokenizer='standard',
                char_filter=['html_strip'],
                filter=['lowercase', 'asciifolding', languages_filters[language]]
            )

        # Add analyzers, the index has to be closed before any configuration
        searches_index = Index('searches')
        # default analyzer
        searches_index.analyzer(default_analyzer)
        # languages search analyzers
        for language in languages:
            searches_index.analyzer(languages_analyzers[language])
        searches_index.save()

        # create the mappings in elasticsearch
        cls.init()
Example #17
def test_cloned_index_has_analysis_attribute():
    """
    Regression test for Issue #582 in which `Index.clone()` was not copying
    over the `_analysis` attribute.
    """
    client = object()
    i = Index('my-index', using=client)

    random_analyzer_name = ''.join((choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(random_analyzer_name, tokenizer="standard", filter="standard")

    i.analyzer(random_analyzer)

    i2 = i.clone('my-clone-index')

    assert i.to_dict()['settings']['analysis'] == i2.to_dict()['settings']['analysis']
Example #19
def setup_indices():
    index = Index(f'{INDEX_NAME}-index')

    index.settings(number_of_shards=1, number_of_replicas=0)

    index.aliases(politicians={})

    index.document(Politicians)

    index.analyzer(analyzer('brazilian'))

    index_template = Politicians._index.as_template(
        INDEX_NAME,
        f'{INDEX_NAME}-*',
    )
    index_template.save()
Example #20
    def handle(self, *args, **options):
        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst_debug")
        if not elastic_index.exists():
            elastic_index.create()
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word in options["words"]:
            analysis = elastic_index.analyze(body={
                "analyzer": "text_analyzer",
                "text": word
            })
            tokens = [i["token"] for i in analysis["tokens"]]
            self.stdout.write("{} {}\n".format(word, tokens))
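The command above assumes a get_text_analyzer helper defined elsewhere; a minimal sketch under that assumption, keeping the 'text_analyzer' name that analyze() references:

from elasticsearch_dsl import analyzer, token_filter

def get_text_analyzer(language):
    # hypothetical helper: builds a custom analyzer named 'text_analyzer'
    stop = token_filter('{}_stop'.format(language), type='stop',
                        stopwords='_{}_'.format(language))
    return analyzer('text_analyzer', tokenizer='standard',
                    filter=['lowercase', stop])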
Example #21
def add_analyzer(index: Index):
    """Agrega un nuevo analyzer al índice, disponible para ser usado
    en todos sus fields. El analyzer aplica lower case + ascii fold:
    quita acentos y uso de ñ, entre otros, para permitir búsqueda de
    texto en español
    """

    synonyms = list(Synonym.objects.values_list('terms', flat=True))

    filters = ['lowercase', 'asciifolding']
    if synonyms:
        filters.append(
            token_filter(constants.SYNONYM_FILTER,
                         type='synonym',
                         synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER, tokenizer='standard', filter=filters))
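A minimal sketch of applying the helper above when (re)creating an index; the index name and document class are assumptions:

index = Index('georef-entities')      # hypothetical index name
add_analyzer(index)                   # registers the lowercase/asciifolding (+ synonyms) analyzer
index.document(EntityDoc)             # assumption: a Document subclass defined elsewhere
index.create()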
Example #22
    def open_spider(self, spider):

        self.client = Elasticsearch()
        try:
            # Drop index if it exists
            ind = Index(self.elastic_db, using=self.client)
            ind.delete()
        except NotFoundError:
            pass
        # then create it
        ind.create()
        ind.close()
        # Configure the default analyzer
        my_analyzer = analyzer('default', type='custom',
            tokenizer=tokenizer('standard'),
            filter=['lowercase', 'asciifolding'])
        ind.analyzer(my_analyzer)
        ind.save()
        ind.open()
Example #23
def create_index(index_name, mapping, alias_names=()):
    """
    Creates an index, initialises it with a mapping, and optionally associates aliases with it.

    Note: If you need to perform multiple alias operations atomically, you should use
    start_alias_transaction() instead of specifying aliases when creating an index.
    """
    index = Index(index_name, mapping.doc_type)
    for analyzer in ANALYZERS:
        index.analyzer(analyzer)

    index.settings(**settings.ES_INDEX_SETTINGS)
    index.mapping(mapping)

    # ES allows you to specify filter criteria for aliases but we don't make use of that –
    # hence the empty dict for each alias
    alias_mapping = {alias_name: {} for alias_name in alias_names}
    index.aliases(**alias_mapping)

    index.create()
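A hedged example of calling this helper; the mapping construction and alias names are illustrative and assume the pre-7.x Mapping API that the snippet itself relies on (it reads mapping.doc_type):

from elasticsearch_dsl import Mapping

mapping = Mapping('company')          # hypothetical doc type name
mapping.field('name', 'text')
create_index('company-000001', mapping,
             alias_names=('company-read', 'company-write'))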
Example #24
    def index_queries(self, tags_path):
        """
        (Re)Creates the index with synonyms, and saves the species names as query documents.
        """
        species = self._read_tags(tags_path)

        index = Index(self.index)
        index.doc_type(self.query_doc_type)

        log.info('Building analyzer')
        index.analyzer(self._analyzer(species))

        log.info('(Re)Creating index')
        index.delete(ignore=404)
        index.create()

        log.info('Registering queries')
        for s in species:
            query_doc = self.query_doc_type(query=self._mk_query_body(s))
            query_doc.save()
Example #25
    def index_queries(self, countries_path, synonyms_path):
        """
        (Re)Creates the index with synonyms, and saves the country query documents.
        """
        countries = self._read_tags(countries_path)
        synonyms = self._read_tags(synonyms_path)

        index = Index(self.index)
        index.doc_type(self.query_doc_type)

        log.info('Building analyzer')
        index.analyzer(self._analyzer(synonyms))

        log.info('(Re)Creating index')
        index.delete(ignore=404)
        index.create()

        log.info('Registering queries')
        for c in countries:
            query_doc = self.query_doc_type(query=self._mk_query_body(c))
            query_doc.save()
Example #26
blogs.settings(
    number_of_shards=1,
    number_of_replicas=0
)

blogs.aliases(
    old_blogs={}
)

# Register a doc_type with the index
blogs.doc_type(Post)


# Also used as a class decorator
@blogs.doc_type
class Post(DocType):
    title = Text()


# Attach a custom analyzer
html_strip = analyzer('html_strip',
    tokenizer='standard',
    filter=['standard', 'lowercase', 'stop', 'snowball'],
    char_filter=['html_strip'])

blogs.analyzer(html_strip)

# Delete the index, ignore if it doesn't exist
blogs.delete(ignore=404)
blogs.create()
Example #27
    tokenizer=tokenizer(
        "ukrainianTokenizer", type="pattern", pattern="[А-ЯЄІЇҐа-яєіїґA-Za-z0-9']+"
    ),
    filter=[
        token_filter(
            "shingleFilter",
            type="shingle",
            max_shingle_size=5,
            min_shingle_size=2,
            output_unigrams=True,
        ),
        "lowercase",
    ],
)

addresses_idx.analyzer(shingle_analyzer)

companies_idx = Index(COMPANIES_INDEX)
companies_idx.settings(number_of_shards=settings.NUM_THREADS, number_of_replicas=0)

namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)
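Presumably namesAutocompleteAnalyzer is then attached to companies_idx the same way shingle_analyzer was attached to addresses_idx above; a hedged sketch:

# assumption: mirrors the addresses_idx.analyzer(...) call earlier in this module
companies_idx.analyzer(namesAutocompleteAnalyzer)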
Example #28
from django.urls import reverse

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

LETS_PARTY_INDEX = "ragoogle_lets_party"
lets_party_idx = Index(LETS_PARTY_INDEX)
lets_party_idx.settings(**BASIC_INDEX_SETTINGS)

lets_party_idx.analyzer(namesAutocompleteAnalyzer)
lets_party_idx.analyzer(namesAutocompleteSearchAnalyzer)
lets_party_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@lets_party_idx.doc_type
class ElasticLetsPartyModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import LetsPartyConfig as AppConfig

        return render_to_string(
            "lets_party/infocard.html",
            {
Example #29
def test_conflicting_analyzer_raises_error():
    i = Index('i')
    i.analyzer('my_analyzer', tokenizer='whitespace', filter=['lowercase', 'stop'])

    with raises(ValueError):
        i.analyzer('my_analyzer', tokenizer='keyword', filter=['lowercase', 'stop'])
Example #30
from dateutil.relativedelta import relativedelta

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

SMIDA_REPORT_INDEX = "ragoogle_smida_report"
smida_report_idx = Index(SMIDA_REPORT_INDEX)
smida_report_idx.settings(**BASIC_INDEX_SETTINGS)

smida_report_idx.analyzer(namesAutocompleteAnalyzer)
smida_report_idx.analyzer(namesAutocompleteSearchAnalyzer)
smida_report_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@smida_report_idx.doc_type
class ElasticSmidaReportModel(AbstractDatasetMapping):
    def render_infocard(self):
        from .apps import SmidaReportConfig as AppConfig

        return render_to_string(
            "smida_reports/infocard.html",
            {
                "res": self,
                "url": self.get_absolute_url(),
                "datasource_name": AppConfig.name,
Example #31
            for member in family:
                if hasattr(member, "family_name"):
                    yield member.family_name
        else:
            for member in parse_raw_family_string(
                getattr(self.general, "family_raw", "")
            ):
                if "family_name" in member:
                    yield member["family_name"]


declarations_idx = Index(OLD_DECLARATION_INDEX)
declarations_idx.settings(
    number_of_shards=NUMBER_OF_SHARDS, number_of_replicas=NUMBER_OF_REPLICAS
)
declarations_idx.analyzer(namesAutocompleteAnalyzer)
declarations_idx.analyzer(namesAutocompleteSearchAnalyzer)


@declarations_idx.doc_type
class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
Example #32
from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

TAX_REG_INDEX = "ragoogle_tax_reg"
tax_reg_idx = Index(TAX_REG_INDEX)
tax_reg_idx.settings(**BASIC_INDEX_SETTINGS)


tax_reg_idx.analyzer(namesAutocompleteAnalyzer)
tax_reg_idx.analyzer(namesAutocompleteSearchAnalyzer)
tax_reg_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@tax_reg_idx.doc_type
class ElasticTaxRegModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import TaxRegConfig as AppConfig

        return render_to_string(
            "tax_reg/infocard.html",
            {
Example #33
}


class Address(DocType):
    """Address document."""

    class Meta:
        index = 'garnahata_addresses'



OWNERSHIP_INDEX = "garnahata_ownerships"
ownership_idx = Index(OWNERSHIP_INDEX)
ownership_idx.settings(**BASIC_INDEX_SETTINGS)

ownership_idx.analyzer(namesAutocompleteAnalyzer)
ownership_idx.analyzer(namesAutocompleteSearchAnalyzer)
ownership_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@ownership_idx.doc_type
class Ownership(DocType):
    """Ownership document."""
    addresses = Text(analyzer="ukrainianAddressesStopwordsAnalyzer", copy_to="all")
    persons = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    registered = Date()
    mortgage_registered = Date()
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
Example #34
    try:
        # Drop index if it exists
        ind = Index(index, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    ind = Index(index, using=client)

    # configure default analyzer
    ind.close()  # index must be closed for configuring analyzer
    ind.analyzer(my_analyzer)

    # configure the path field so it is not tokenized and we can do exact match search
    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={"document": {
            "properties": {
                "path": {
                    "type": "keyword",
                }
            }
        }})

    ind.save()
    ind.open()
Example #35
def buildIndex():
    """
    buildIndex creates a new covid document index, deleting any existing
    index of the same name. It loads a json file containing the covid doc
    metadata corpus and does bulk loading using a generator function.
    """
    doc_index = Index('covid_doc_index')
    if doc_index.exists():
        doc_index.delete()  # Overwrite any previous version
    doc_index.analyzer(
        basic_analyzer
    )  # register your customized analyzer as the default analyzer
    doc_index.create()

    # Open the covid metadata corpus
    with open('covid_comm_use_subset_meta.json', 'r',
              encoding='utf-8') as data_file:
        enum_id = 1
        documents = {}
        for line in data_file:
            try:
                doc = json.loads(line)
                for key in doc.keys():
                    if type(doc.get(key)) is not str:
                        if math.isnan(doc.get(key)):
                            doc.update({key: None})
                    if key == "publish_time" and doc.get(key) and len(
                            doc.get(key)) > 4:
                        match = re.search(year, doc[key]).group(0)
                        if match:
                            doc.update({key: match})
                documents.update({str(enum_id): doc})
                enum_id += 1
            except json.decoder.JSONDecodeError:
                continue
        # load doc metadata from json file into dictionary
        size = len(documents)

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one document with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # enum_id is an enumerated id created when reading the json, used as a key into the covid metadata dictionary
        for enum_id in range(1, size + 1):
            yield {
                "_index":
                "covid_doc_index",
                "_type":
                'doc',
                "_id":
                enum_id,
                "title":
                documents[str(enum_id)].get('title', 'None'),
                "text":
                documents[str(enum_id)].get('abstract', 'None'),
                "authors":
                documents[str(enum_id)].get('authors', 'None'),
                "publish_time":
                documents[str(enum_id)].get('publish_time', int(0000))
            }

    helpers.bulk(es, actions())
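buildIndex assumes module-level globals (es, basic_analyzer and the year regex) defined elsewhere in the file; a minimal sketch of what they might look like:

import re
from elasticsearch import Elasticsearch
from elasticsearch_dsl import analyzer

es = Elasticsearch()
year = re.compile(r'\d{4}')                  # assumption: pattern used to pull the year out of publish_time
basic_analyzer = analyzer('basic_analyzer',  # assumption: a simple custom default analyzer
                          tokenizer='standard',
                          filter=['lowercase', 'asciifolding'])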