def create_index(es, name, doc_class, synonyms=None, excluding_terms=None):
    """Creates an Elasticsearch index from a name and a document class.

    Args:
        es (elasticsearch.Elasticsearch): Elasticsearch client.
        name (str): Name of the index to create.
        doc_class (type): Document class (must inherit from Document).
        synonyms (list): List of synonyms to use when the
            'name_analyzer_synonyms' analyzer is needed.
        excluding_terms (list): List of excluding terms to use when the
            'name_analyzer_excluding_terms' analyzer is needed.

    """
    index = Index(name)

    # Create the 'name_analyzer_synonyms' analyzer only if it was explicitly
    # requested. If the 'doc_class' document uses the analyzer anywhere in its
    # mapping, the 'synonyms' list should be present.
    if synonyms is not None:
        index.analyzer(gen_name_analyzer_synonyms(synonyms))

    # Same reasoning as for 'name_analyzer_synonyms'.
    if excluding_terms is not None:
        index.analyzer(gen_name_analyzer_excluding_terms(excluding_terms))

    index.document(doc_class)
    index.create(using=es)
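# A minimal sketch (not from the original source) of what a helper like
# gen_name_analyzer_synonyms() could look like, assuming it builds a custom
# analyzer named 'name_analyzer_synonyms' from one synonym group per list
# entry (e.g. "avenida, av"). The 'synonyms_filter' name is illustrative.
from elasticsearch_dsl import analyzer, token_filter

def gen_name_analyzer_synonyms_sketch(synonyms):
    return analyzer(
        'name_analyzer_synonyms',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            token_filter('synonyms_filter', type='synonym', synonyms=synonyms),
        ],
    )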
def createIndex():
    connections.create_connection(hosts=[host], timeout=20)
    if not connections.get_connection().indices.exists(index):
        odrednicaIdx = Index(index)
        odrednicaIdx.analyzer(serbianAnalyzer)
        odrednicaIdx.document(Odrednica)
        odrednicaIdx.create()
def insert_document_to_index(documents, text_an, index, keep):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists() and not keep:
        print('Removing existing index...')
        idx.delete()
    if not idx.exists():
        print('Creating index')
        idx.create()
        idx.close()
        idx.analyzer(text_an)
        client.indices.put_mapping(
            doc_type='document',
            index=index,
            body={'document': {
                'properties': {
                    'path': {
                        'type': 'keyword'
                    }
                }
            }})
        idx.save()
        idx.open()
    print("Index settings=", idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
def setup_index(year):
    index = Index(f'{INDEX_NAME}-{year}')
    index.settings(number_of_shards=2, number_of_replicas=0)
    index.aliases(politicians={})
    index.document(Politicians)
    index.analyzer(brazilian_analyzer)
    index.create()
def test_analyzers_returned_from_to_dict():
    random_analyzer_name = ''.join(choice(string.ascii_letters) for _ in range(100))
    random_analyzer = analyzer(
        random_analyzer_name, tokenizer="standard", filter="standard"
    )
    index = Index('i', using='alias')
    index.analyzer(random_analyzer)

    assert index.to_dict()["settings"]["analysis"]["analyzer"][random_analyzer_name] == {
        "filter": ["standard"],
        "type": "custom",
        "tokenizer": "standard",
    }
def construct_index(cls, opts, bases):
    i = None
    if opts is None:
        # Inherit Index from base classes
        for b in bases:
            if getattr(b, "_index", DEFAULT_INDEX) is not DEFAULT_INDEX:
                parent_index = b._index
                i = Index(
                    parent_index._name,
                    doc_type=parent_index._mapping.doc_type,
                    using=parent_index._using,
                )
                i._settings = parent_index._settings.copy()
                i._aliases = parent_index._aliases.copy()
                i._analysis = parent_index._analysis.copy()
                i._doc_types = parent_index._doc_types[:]
                break
    if i is None:
        i = Index(
            getattr(opts, "name", "*"),
            doc_type=getattr(opts, "doc_type", "doc"),
            using=getattr(opts, "using", "default"),
        )

    i.settings(**getattr(opts, "settings", {}))
    i.aliases(**getattr(opts, "aliases", {}))
    for a in getattr(opts, "analyzers", ()):
        i.analyzer(a)
    return i
def insert_documents_to_index(documents, an, index):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists():
        idx.delete()
    idx.settings(number_of_shards=1)
    idx.create()

    idx = Index(index, using=client)
    idx.close()
    idx.analyzer(an)
    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})
    idx.save()
    idx.open()
    print('Index settings=', idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
def add_company_search_index_and_populate(apps, schema_editor):
    companies = Index('companies')
    if not companies.exists():
        companies.doc_type(CompanyDocType)
        companies.analyzer(analyzer('english'))
        companies.create()
        management.call_command('populate_elasticsearch')
def handle(self, *args, **options):
    companies = Index('companies')
    companies.delete(ignore=404)
    companies.doc_type(CompanyDocType)
    companies.analyzer(analyzer('english'))
    companies.create()
    management.call_command('populate_elasticsearch')
def create_index(self, name, doc_type, alias):
    index = Index(name)
    index.document(doc_type)
    index.analyzer(analyzer('english'))
    # Give the index an alias (e.g. `company_alias`), so the index is used
    # when the application searches from or inserts into `company_alias`.
    index.aliases(**{alias: {}})  # same as .aliases(company_alias={})
    index.create()
    doc_type._index = index
    return index
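# Hedged usage sketch (an assumption, not part of the original code): with the
# alias attached at creation time, queries can target the alias instead of the
# concrete index name, so a rebuilt index can later be swapped in behind the
# same alias without touching the search code. The field name 'name' is
# illustrative.
from elasticsearch_dsl import Search

def search_via_alias(alias, term):
    # Requires a default connection to be configured beforehand.
    return Search(index=alias).query('match', name=term).execute()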
def test_conflicting_analyzer_raises_error():
    i = Index('i')
    i.analyzer('my_analyzer', tokenizer='whitespace', filter=['lowercase', 'stop'])

    with raises(ValueError):
        i.analyzer('my_analyzer', tokenizer='keyword', filter=['lowercase', 'stop'])
def construct_index(cls, opts, bases):
    i = Index(
        getattr(opts, "name", "*"),
        doc_type=getattr(opts, "doc_type", "doc"),
        using=getattr(opts, "using", "default"),
    )
    i.settings(**getattr(opts, "settings", {}))
    i.aliases(**getattr(opts, "aliases", {}))
    for a in getattr(opts, "analyzers", ()):
        i.analyzer(a)
    return i
def create_index_if_needed():
    try:
        for es_idx in ALL_INDEXES:
            if not connections.get_connection().indices.exists(es_idx['index']):
                idx = Index(es_idx['index'])
                idx.analyzer(SERBIAN_ANALYZER)
                idx.document(es_idx['document'])
                idx.create()
    except Exception as ex:
        log.fatal(ex)
def create_index(self, name, document, alias):
    index = Index(name)
    index.document(document)
    index.analyzer(analyzer('english'))
    # Give the index an alias (e.g. `company_alias`), so the index is used
    # when the application searches from or inserts into `company_alias`.
    index.aliases(**{alias: {}})  # same as .aliases(company_alias={})
    index.create()
    document._index = index
    self.stdout.write(self.style.SUCCESS('New index created'))
    return index
def init_index(cls):
    '''Class method to init index'''
    # default analyzer
    shingle_filter = token_filter(
        'shingle_filter',
        type='shingle',
        min_shingle_size=2,
        max_shingle_size=3,
    )
    default_analyzer = analyzer(
        'default',
        tokenizer='standard',
        char_filter=['html_strip'],
        filter=['lowercase', 'asciifolding', shingle_filter]
    )

    # set the analyzers for the available languages
    # TODO: languages and languages_stopwords should be in settings
    languages = ('es', 'en')
    languages_stopwords = {
        'en': '_english_',
        'es': '_spanish_',
    }
    languages_analyzers = {}
    languages_filters = {}
    for language in languages:
        languages_filters[language] = token_filter(
            language + '_filter',
            type='stop',
            stopwords=languages_stopwords[language],
        )
        languages_analyzers[language] = analyzer(
            language + '_analyzer',
            tokenizer='standard',
            char_filter=['html_strip'],
            filter=['lowercase', 'asciifolding', languages_filters[language]]
        )

    # Add analyzers; the index has to be closed before any configuration
    searches_index = Index('searches')

    # default analyzer
    searches_index.analyzer(default_analyzer)

    # language search analyzers
    for language in languages:
        searches_index.analyzer(languages_analyzers[language])

    searches_index.save()

    # create the mappings in elasticsearch
    cls.init()
def test_cloned_index_has_analysis_attribute():
    """
    Regression test for Issue #582 in which `Index.clone()` was not copying
    over the `_analysis` attribute.
    """
    client = object()
    i = Index('my-index', using=client)

    random_analyzer_name = ''.join(choice(string.ascii_letters) for _ in range(100))
    random_analyzer = analyzer(
        random_analyzer_name, tokenizer="standard", filter="standard"
    )
    i.analyzer(random_analyzer)

    i2 = i.clone('my-clone-index')

    assert i.to_dict()['settings']['analysis'] == i2.to_dict()['settings']['analysis']
def setup_indices():
    index = Index(f'{INDEX_NAME}-index')
    index.settings(number_of_shards=1, number_of_replicas=0)
    index.aliases(politicians={})
    index.document(Politicians)
    index.analyzer(analyzer('brazilian'))

    index_template = Politicians._index.as_template(
        INDEX_NAME,
        f'{INDEX_NAME}-*',
    )
    index_template.save()
def handle(self, *args, **options):
    text_analyzer = get_text_analyzer("german")
    elastic_index = Index("mst_debug")
    if not elastic_index.exists():
        elastic_index.create()
    elastic_index.close()
    elastic_index.analyzer(text_analyzer)
    elastic_index.save()
    elastic_index.open()
    elastic_index.flush()

    for word in options["words"]:
        analysis = elastic_index.analyze(body={
            "analyzer": "text_analyzer",
            "text": word
        })
        tokens = [i["token"] for i in analysis["tokens"]]
        self.stdout.write("{} {}\n".format(word, tokens))
def add_analyzer(index: Index):
    """Adds a new analyzer to the index, available for use by all of its fields.

    The analyzer applies lowercasing + ASCII folding: it strips accents and
    the use of 'ñ', among other things, to allow text search in Spanish.
    """
    synonyms = list(Synonym.objects.values_list('terms', flat=True))
    filters = ['lowercase', 'asciifolding']

    if synonyms:
        filters.append(
            token_filter(constants.SYNONYM_FILTER,
                         type='synonym',
                         synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER,
                 tokenizer='standard',
                 filter=filters))
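# Self-contained sketch (an assumption; the index and analyzer names are
# illustrative): an analyzer registered through Index.analyzer() ends up under
# settings.analysis in the body that Index.create() sends, which is how the
# add_analyzer() helper above takes effect. to_dict() needs no connection.
from elasticsearch_dsl import Index, analyzer

idx = Index('georef-demo')
idx.analyzer(analyzer('name_analyzer',
                      tokenizer='standard',
                      filter=['lowercase', 'asciifolding']))
assert 'name_analyzer' in idx.to_dict()['settings']['analysis']['analyzer']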
def open_spider(self, spider):
    self.client = Elasticsearch()
    try:
        # Drop index if it exists
        ind = Index(self.elastic_db, using=self.client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.create()
    ind.close()

    # Configure tokenizer
    my_analyzer = analyzer('default',
                           type='custom',
                           tokenizer=tokenizer('standard'),
                           filter=['lowercase', 'asciifolding'])
    ind.analyzer(my_analyzer)
    ind.save()
    ind.open()
def create_index(index_name, mapping, alias_names=()):
    """
    Creates an index, initialises it with a mapping, and optionally associates
    aliases with it.

    Note: If you need to perform multiple alias operations atomically, you
    should use start_alias_transaction() instead of specifying aliases when
    creating an index.
    """
    index = Index(index_name, mapping.doc_type)
    for analyzer in ANALYZERS:
        index.analyzer(analyzer)
    index.settings(**settings.ES_INDEX_SETTINGS)
    index.mapping(mapping)

    # ES allows you to specify filter criteria for aliases but we don't make
    # use of that – hence the empty dict for each alias
    alias_mapping = {alias_name: {} for alias_name in alias_names}
    index.aliases(**alias_mapping)

    index.create()
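# Hedged sketch (an assumption; index and alias names are illustrative): the
# alias mapping built above is just an empty dict per alias, and it shows up
# under 'aliases' in the index body, which a standalone Index demonstrates
# without a cluster.
from elasticsearch_dsl import Index

idx = Index('company-000001')
idx.aliases(**{name: {} for name in ('company-read', 'company-write')})
assert idx.to_dict()['aliases'] == {'company-read': {}, 'company-write': {}}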
def index_queries(self, tags_path):
    """
    (Re)Creates the index with synonyms, and saves the species names as
    query documents.
    """
    species = self._read_tags(tags_path)
    index = Index(self.index)
    index.doc_type(self.query_doc_type)

    log.info('Building analyzer')
    index.analyzer(self._analyzer(species))

    log.info('(Re)Creating index')
    index.delete(ignore=404)
    index.create()

    log.info('Registering queries')
    for s in species:
        query_doc = self.query_doc_type(query=self._mk_query_body(s))
        query_doc.save()
def index_queries(self, countries_path, synonyms_path):
    """
    (Re)Creates the index with synonyms, and saves the country query documents.
    """
    countries = self._read_tags(countries_path)
    synonyms = self._read_tags(synonyms_path)
    index = Index(self.index)
    index.doc_type(self.query_doc_type)

    log.info('Building analyzer')
    index.analyzer(self._analyzer(synonyms))

    log.info('(Re)Creating index')
    index.delete(ignore=404)
    index.create()

    log.info('Registering queries')
    for c in countries:
        query_doc = self.query_doc_type(query=self._mk_query_body(c))
        query_doc.save()
blogs.settings(
    number_of_shards=1,
    number_of_replicas=0
)
blogs.aliases(
    old_blogs={}
)

# Register a doc_type with the index
blogs.doc_type(Post)

# Also used as a class decorator
@blogs.doc_type
class Post(DocType):
    title = Text()

# Attach a custom analyzer
html_strip = analyzer('html_strip',
                      tokenizer='standard',
                      filter=['standard', 'lowercase', 'stop', 'snowball'],
                      char_filter=['html_strip'])
blogs.analyzer(html_strip)

# Delete the index, ignore if it doesn't exist
blogs.delete(ignore=404)
blogs.create()
    tokenizer=tokenizer(
        "ukrainianTokenizer", type="pattern", pattern="[А-ЯЄІЇҐа-яєіїґA-Za-z0-9']+"
    ),
    filter=[
        token_filter(
            "shingleFilter",
            type="shingle",
            max_shingle_size=5,
            min_shingle_size=2,
            output_unigrams=True,
        ),
        "lowercase",
    ],
)
addresses_idx.analyzer(shingle_analyzer)

companies_idx = Index(COMPANIES_INDEX)
companies_idx.settings(number_of_shards=settings.NUM_THREADS, number_of_replicas=0)

namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)
from django.urls import reverse

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

LETS_PARTY_INDEX = "ragoogle_lets_party"
lets_party_idx = Index(LETS_PARTY_INDEX)
lets_party_idx.settings(**BASIC_INDEX_SETTINGS)

lets_party_idx.analyzer(namesAutocompleteAnalyzer)
lets_party_idx.analyzer(namesAutocompleteSearchAnalyzer)
lets_party_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@lets_party_idx.doc_type
class ElasticLetsPartyModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import LetsPartyConfig as AppConfig

        return render_to_string(
            "lets_party/infocard.html",
            {
from dateutil.relativedelta import relativedelta

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

SMIDA_REPORT_INDEX = "ragoogle_smida_report"
smida_report_idx = Index(SMIDA_REPORT_INDEX)
smida_report_idx.settings(**BASIC_INDEX_SETTINGS)

smida_report_idx.analyzer(namesAutocompleteAnalyzer)
smida_report_idx.analyzer(namesAutocompleteSearchAnalyzer)
smida_report_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@smida_report_idx.doc_type
class ElasticSmidaReportModel(AbstractDatasetMapping):
    def render_infocard(self):
        from .apps import SmidaReportConfig as AppConfig

        return render_to_string(
            "smida_reports/infocard.html",
            {
                "res": self,
                "url": self.get_absolute_url(),
                "datasource_name": AppConfig.name,
            for member in family:
                if hasattr(member, "family_name"):
                    yield member.family_name
        else:
            for member in parse_raw_family_string(
                getattr(self.general, "family_raw", "")
            ):
                if "family_name" in member:
                    yield member["family_name"]


declarations_idx = Index(OLD_DECLARATION_INDEX)
declarations_idx.settings(
    number_of_shards=NUMBER_OF_SHARDS, number_of_replicas=NUMBER_OF_REPLICAS
)

declarations_idx.analyzer(namesAutocompleteAnalyzer)
declarations_idx.analyzer(namesAutocompleteSearchAnalyzer)


@declarations_idx.doc_type
class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

TAX_REG_INDEX = "ragoogle_tax_reg"
tax_reg_idx = Index(TAX_REG_INDEX)
tax_reg_idx.settings(**BASIC_INDEX_SETTINGS)

tax_reg_idx.analyzer(namesAutocompleteAnalyzer)
tax_reg_idx.analyzer(namesAutocompleteSearchAnalyzer)
tax_reg_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@tax_reg_idx.doc_type
class ElasticTaxRegModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import TaxRegConfig as AppConfig

        return render_to_string(
            "tax_reg/infocard.html",
            {
    }


class Address(DocType):
    """Address document."""

    class Meta:
        index = 'garnahata_addresses'


OWNERSHIP_INDEX = "garnahata_ownerships"
ownership_idx = Index(OWNERSHIP_INDEX)
ownership_idx.settings(**BASIC_INDEX_SETTINGS)

ownership_idx.analyzer(namesAutocompleteAnalyzer)
ownership_idx.analyzer(namesAutocompleteSearchAnalyzer)
ownership_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@ownership_idx.doc_type
class Ownership(DocType):
    """Ownership document."""

    addresses = Text(analyzer="ukrainianAddressesStopwordsAnalyzer", copy_to="all")
    persons = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    registered = Date()
    mortgage_registered = Date()
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
try:
    # Drop index if it exists
    ind = Index(index, using=client)
    ind.delete()
except NotFoundError:
    pass

# then create it
ind.settings(number_of_shards=1)
ind.create()
ind = Index(index, using=client)

# configure default analyzer
ind.close()  # index must be closed for configuring analyzer
ind.analyzer(my_analyzer)

# configure the path field so it is not tokenized and we can do exact match search
client.indices.put_mapping(
    doc_type='document',
    index=index,
    body={"document": {
        "properties": {
            "path": {
                "type": "keyword",
            }
        }
    }})

ind.save()
ind.open()
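# Hedged usage sketch (an assumption; index and field names are illustrative):
# because `path` is mapped as a keyword it is stored untokenized, so a term
# query matches the whole stored value exactly.
from elasticsearch_dsl import Search

def find_by_exact_path(client, index, path):
    # term query against a keyword field == exact, untokenized match
    return Search(using=client, index=index).query('term', path=path).execute()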
def buildIndex():
    """
    buildIndex creates a new covid document index, deleting any existing index
    of the same name. It loads a json file containing the covid doc metadata
    corpus and does bulk loading using a generator function.
    """
    doc_index = Index('covid_doc_index')
    if doc_index.exists():
        doc_index.delete()  # Overwrite any previous version
    # register your customized analyzer as the default analyzer
    doc_index.analyzer(basic_analyzer)
    doc_index.create()

    # Open the covid metadata corpus and load the doc metadata from the json
    # file into a dictionary
    documents = {}
    with open('covid_comm_use_subset_meta.json', 'r', encoding='utf-8') as data_file:
        enum_id = 1
        for line in data_file:
            try:
                doc = json.loads(line)
                for key in doc.keys():
                    if type(doc.get(key)) is not str:
                        if math.isnan(doc.get(key)):
                            doc.update({key: None})
                    if key == "publish_time" and doc.get(key) and len(doc.get(key)) > 4:
                        match = re.search(year, doc[key]).group(0)
                        if match:
                            doc.update({key: match})
                documents.update({str(enum_id): doc})
                enum_id += 1
            except json.decoder.JSONDecodeError:
                continue

    size = len(documents)

    # Action series for bulk loading with the helpers.bulk function.
    # Implemented as a generator, to return one document with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # enum_id is an enumerated id created when reading the json and used
        # as a key into the covid metadata dictionary
        for enum_id in range(1, size + 1):
            yield {
                "_index": "covid_doc_index",
                "_type": 'doc',
                "_id": enum_id,
                "title": documents[str(enum_id)].get('title', 'None'),
                "text": documents[str(enum_id)].get('abstract', 'None'),
                "authors": documents[str(enum_id)].get('authors', 'None'),
                "publish_time": documents[str(enum_id)].get('publish_time', 0)
            }

    helpers.bulk(es, actions())