from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Index


def insert_documents_to_index(documents, an, index):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists():
        idx.delete()
    idx.settings(number_of_shards=1)
    idx.create()

    idx = Index(index, using=client)
    # The index must be closed while the analyzer is (re)configured.
    idx.close()
    idx.analyzer(an)
    # Map 'path' as a keyword so it is not tokenized and supports exact match.
    client.indices.put_mapping(
        doc_type='document', index=index,
        body={'document': {'properties': {'path': {'type': 'keyword'}}}})
    idx.save()
    idx.open()
    print('Index settings=', idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
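A minimal usage sketch for the helper above. The analyzer, index name, and document payloads are illustrative, and the bulk action format assumes the pre-7.x Elasticsearch that the doc_type mapping implies:

from elasticsearch_dsl import analyzer, tokenizer

# Hypothetical analyzer and documents; only the shapes matter here.
folding_an = analyzer('default', type='custom',
                      tokenizer=tokenizer('standard'),
                      filter=['lowercase', 'asciifolding'])
docs = [{'_index': 'docs', '_type': 'document',
         '_source': {'path': '/data/a.txt', 'text': 'first document'}}]
insert_documents_to_index(docs, folding_an, 'docs')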
from importlib import import_module

from elasticsearch import TransportError
from elasticsearch_dsl import Index


def _init_index(index_config, force):
    index = Index(index_config['name'])

    aliases = {}
    for alias_val in index_config['alias']:
        if isinstance(alias_val, str):  # was `basestring` (Python 2)
            aliases[alias_val] = {}
        else:
            aliases[alias_val['name']] = alias_val['config']
    index.aliases(**aliases)

    if force:
        index.delete(ignore=404)

    try:
        index.create()
    except TransportError as err:
        # Elasticsearch answers 400 (resource_already_exists_exception) when
        # the index already exists; the original checked 404. Re-raise anything else.
        if err.status_code == 400:
            logger.debug('Index already exists, initializing document')
        else:
            raise

    # The index must be closed while doc types are registered and initialized.
    index.close()
    for document_config in index_config['documents']:
        module_str, class_str = document_config['class'].rsplit('.', 1)
        module = import_module(module_str)
        cls = getattr(module, class_str)
        index.doc_type(cls)
        cls.init()
    index.open()
    return index
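A config shape that satisfies _init_index, inferred from how the function reads it; every name below is illustrative:

example_config = {
    'name': 'articles',
    'alias': [
        'articles-read',                           # bare alias name
        {'name': 'articles-write', 'config': {}},  # alias with options
    ],
    'documents': [
        # Dotted path, split on the last '.' to import the doc type class.
        {'class': 'myapp.documents.ArticleDocument'},
    ],
}
index = _init_index(example_config, force=False)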
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Index


def insert_document_to_index(documents, text_an, index, keep):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists() and not keep:
        print('Removing existing index...')
        idx.delete()
    if not idx.exists():
        print('Creating index')
        idx.create()

    # The index must be closed while the analyzer is (re)configured.
    idx.close()
    idx.analyzer(text_an)
    client.indices.put_mapping(
        doc_type='document', index=index,
        body={'document': {'properties': {'path': {'type': 'keyword'}}}})
    idx.save()
    idx.open()
    print("Index settings=", idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Index


def close_index_with_pk(dic_pk):
    index_name = get_index_name_with_pk(dic_pk)
    if index_name == 'mdict-':
        print(dic_pk, 'does not exist')
        return
    index = Index(index_name)
    try:
        index.close()
    except NotFoundError as e:
        print(e)
    print('close', dic_pk, index_name)
from elasticsearch_dsl import Index


def init_models(models):
    """Init each model and create its index if it does not exist."""
    for _, model_obj in models.items():
        # TO BE FIXED: waiting for the following issue
        # https://github.com/elastic/elasticsearch-dsl-py/pull/272
        i = Index(model_obj._doc_type.index)
        if i.exists():
            i.close()
        model_obj.init()
        i.open()
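A minimal model that init_models would accept, assuming the DocType-era API implied by the _doc_type.index lookup; the class and index name are illustrative:

from elasticsearch_dsl import DocType, Keyword, Text

class Article(DocType):
    title = Text()
    slug = Keyword()

    class Meta:
        index = 'articles'

init_models({'article': Article})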
def initialize(self, using=None):
    """Initialize / update the doc type."""
    from .settings import INDEX_DEFAULTS

    meta = dict(INDEX_DEFAULTS)
    meta.update(self._meta.meta or {})

    _idx = DSLIndex(self._meta.document._doc_type.index)
    _idx.settings(**meta)
    if not _idx.exists():
        _idx.create()
    else:
        # Static settings can only be changed on a closed index;
        # number_of_shards cannot be changed at all.
        static_settings = ['number_of_shards', 'codec', 'routing_partition_size']
        not_updateable = ['number_of_shards']

        def filter_out_not_updateable(settings):
            return dict(filter(lambda x: x[0] not in not_updateable,
                               settings.items()))

        idx_dict = _idx.to_dict()
        idx_settings = idx_dict.get('settings') or {}
        # A default avoids a KeyError when no analysis is configured.
        idx_analysis = idx_settings.pop('analysis', None) or {}
        idx_static = dict(
            map(lambda x: (x, idx_settings.pop(x)),
                list(filter(lambda x: x in static_settings, idx_settings))))

        idx_settings = filter_out_not_updateable(idx_settings)
        idx_static = filter_out_not_updateable(idx_static)

        # Dynamic settings can be updated on an open index.
        if idx_settings:
            _idx.put_settings(body=idx_settings, preserve_existing=True)
        try:
            _idx.close()
            _idx.put_settings(body={'analysis': idx_analysis},
                              preserve_existing=True)
            if idx_static:
                _idx.put_settings(body=idx_static, preserve_existing=True)
        finally:
            _idx.open()

    self._meta.document.init(using=using)
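The close/open dance above exists because analysis is a static setting: Elasticsearch rejects a put_settings body containing analysis while the index is open. A standalone sketch of the same pattern with a hypothetical index name and analyzer:

from elasticsearch_dsl import Index

idx = Index('my-index')  # hypothetical index name
idx.close()              # analysis can only be changed on a closed index
try:
    idx.put_settings(body={'analysis': {'analyzer': {
        'folded': {'type': 'custom', 'tokenizer': 'standard',
                   'filter': ['lowercase', 'asciifolding']}}}})
finally:
    idx.open()           # reopen even if the update fails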
def handle(self, *args, **options):
    text_analyzer = get_text_analyzer("german")
    elastic_index = Index("mst_debug")
    if not elastic_index.exists():
        elastic_index.create()
    elastic_index.close()
    elastic_index.analyzer(text_analyzer)
    elastic_index.save()
    elastic_index.open()
    elastic_index.flush()

    for word in options["words"]:
        analysis = elastic_index.analyze(
            body={"analyzer": "text_analyzer", "text": word})
        tokens = [i["token"] for i in analysis["tokens"]]
        self.stdout.write("{} {}\n".format(word, tokens))
def open_spider(self, spider):
    self.client = Elasticsearch()
    try:
        # Drop index if it exists
        ind = Index(self.elastic_db, using=self.client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.create()
    ind.close()
    # Configure tokenizer
    my_analyzer = analyzer('default', type='custom',
                           tokenizer=tokenizer('standard'),
                           filter=['lowercase', 'asciifolding'])
    ind.analyzer(my_analyzer)
    ind.save()
    ind.open()
class ElasticSearchIndex:
    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        # BM25 similarity parameters are static settings, so the index
        # has to be closed while they are applied.
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}
        }})
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True,
                         use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info('Indexing questions and corresponding wikipedia pages as large docs...')
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''
                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''
                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content)
                answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info('Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        # Chunk wikipedia pages into documents of ~200 tokens.
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(
                                    page=page,
                                    wiki_content=' '.join(chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False, wiki_boost=1, qb_boost=1):
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            else:
                guesses.append((r.page, r.meta.score / query_length))
                # Track seen pages so the dedup check above actually works;
                # the original never populated guess_set.
                guess_set.add(r.page)
        return guesses
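A usage sketch for the class above; the page and question text are placeholders, and the surrounding qanta helpers (create_doctype, Wikipedia) are assumed to be importable:

index = ElasticSearchIndex(name='qb', bm25_k1=0.9)
index.build_large_docs(
    {'Albert_Einstein': 'This physicist developed the theory of relativity...'},
    use_wiki=False, rebuild_index=True)
guesses = index.search('developed the theory of relativity', max_n_guesses=5)
# guesses is a list of (page, score) pairs with duplicate pages filtered out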
                      filter=args.filter)
try:
    # Drop index if it exists
    ind = Index(index, using=client)
    ind.delete()
except NotFoundError:
    pass
# then create it
ind.settings(number_of_shards=1)
ind.create()
ind = Index(index, using=client)
# configure default analyzer
ind.close()  # index must be closed for configuring analyzer
ind.analyzer(my_analyzer)
# configure the path field so it is not tokenized and we can do exact match search
client.indices.put_mapping(
    doc_type='document', index=index,
    body={"document": {"properties": {"path": {"type": "keyword"}}}})
ind.save()
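The keyword mapping above is what makes exact-match search on path possible; a sketch of querying it, with a hypothetical path value:

from elasticsearch_dsl import Search

s = Search(using=client, index=index).filter('term', path='/data/report.pdf')
for hit in s:
    print(hit.path)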
class IrIndex:
    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        # BM25 parameters are static settings: close, apply, reopen.
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}
        }})
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build(self, qb_docs: Dict[str, str], asr_docs: Dict[str, str],
              use_wiki=False, use_qb=True, use_asr=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):  # pylint: disable=invalid-envvar-default
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            # wiki_lookup = Wikipedia()
            log.info('Indexing...')
            for page in tqdm.tqdm(qb_docs):
                wiki_content = ''
                # if use_wiki and page in wiki_lookup:
                #     wiki_content = wiki_lookup[page].text
                # else:
                #     wiki_content = ''
                if use_qb:
                    qb_content = qb_docs[page]
                else:
                    qb_content = ''
                if use_asr:
                    asr_content = asr_docs[page]
                else:
                    asr_content = ''
                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content,
                                         asr_content=asr_content)
                answer.save(index=self.name)

    def search(self, text: str, max_n_guesses: int, normalize_score_by_length=False):
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        wiki_field = 'wiki_content'
        qb_field = 'qb_content'
        asr_field = 'asr_content'
        s = Search(index=self.name)[0:max_n_guesses].query(  # pylint: disable=no-member
            'multi_match', query=text, fields=[wiki_field, qb_field, asr_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = max(1, len(text.split()))
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            else:
                guesses.append({'guess': r.page, 'score': r.meta.score,
                                'length': query_length})
                # Track seen pages so the dedup check above actually works;
                # the original never populated guess_set.
                guess_set.add(r.page)

        if len(guesses) == 0:
            return {'guess': '~~~NOGUESS~~~', 'score': 0, 'length': 1}
        else:
            return guesses[0]
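Unlike ElasticSearchIndex.search above, IrIndex.search returns only the single best hit as a dict. A sketch with placeholder documents:

ir = IrIndex(name='qb')
ir.build({'Max_Planck': 'He originated quantum theory...'},
         {'Max_Planck': 'he originated quantum theory'})
best = ir.search('originated quantum theory', max_n_guesses=10)
# best is {'guess': ..., 'score': ..., 'length': ...}, or the ~~~NOGUESS~~~ fallback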
def createIndex(isUpdate=True):
    # 5. Create the document classes for storing Addresses.
    # (The Index objects are renamed from `address`/`houses` so they do not
    # shadow the modules whose INDEX names are used below.)
    address_index = Index(address.INDEX)
    address_index.close()
    houses_index = Index(houses.INDEX)

    @houses_index.document
    class House(Document, InnerDoc):
        houseId = Keyword()
        house_num = Keyword()
        build_num = Keyword()
        str_num = Keyword()
        postal_code = Keyword()
        ifns_fl = Keyword()
        ifns_ul = Keyword()
        counter = Keyword()

    @address_index.document
    class Address(Document):
        ao_guid = Keyword(required=True)
        parent_guid = Keyword()
        act_status = Integer()
        curr_status = Integer()
        live_status = Integer()
        oper_status = Integer()
        formal_name = Keyword()
        off_name = Keyword()
        short_name = Keyword()
        region_code = Keyword()
        ao_level = Keyword()
        area_code = Keyword()
        auto_code = Keyword()
        extr_code = Keyword()
        city_ar_code = Keyword()
        city_code = Keyword()
        street_code = Keyword()
        plan_code = Keyword()
        place_code = Keyword()
        sub_ext_code = Keyword()
        plain_code = Keyword()
        code = Keyword()
        okato = Keyword()
        oktmo = Keyword()
        postal_code = Keyword()
        terr_ifns_fl = Keyword()
        terr_ifns_ul = Keyword()
        ifns_fl = Keyword()
        ifns_ul = Keyword()
        norm_doc = Keyword()
        district = Keyword()
        district_type = Keyword()
        settlement = Keyword()
        settlement_type = Keyword()
        street = Keyword()
        street_type = Keyword()
        start_date = Date()
        end_date = Date()
        update_date = Date()
        street_address_suggest = Text(analyzer="autocomplete")
        full_address = Keyword()
        district_full = Keyword()
        settlement_full = Keyword()
        street_full = Keyword()
        houses = Nested(House)

        def add_house(self, house_num, build_num):
            self.houses.append(House(house_num=house_num, build_num=build_num))

        def save(self, **kwargs):
            return super().save(**kwargs)

    Address.init()
    address_index.open()

    queryAllStreet = {
        "query": {
            "bool": {
                "must": [{"match_all": {}}],
                "filter": {"term": {"ao_level": "7"}}
            }
        }
    }
    queryByRegion = {
        "query": {
            "bool": {
                "must": [{"term": {"region_code": {"value": "13"}}}],
                "filter": {"term": {"ao_level": "7"}}
            }
        }
    }
    update_date = str(fiases.fias_data.VERSION_DATE) \
        + fiases.fias_data.DATE_TIME_ZONE
    queryUpdate = {
        "query": {
            "bool": {
                "must": [{"term": {"update_date": {"value": update_date}}}],
                "filter": {"term": {"ao_level": "7"}}
            }
        }
    }

    # Select all streets
    if isUpdate:
        print("indexing ...")
        scanResStreet = scan(ES, scroll='1h', query=queryUpdate,
                             index=address.INDEX)
        ADDR_UPDATE_CNT = Address.search()\
            .query("term", update_date=update_date)\
            .filter("term", ao_level="7").count()
    else:
        print("Full indexing ...")
        scanResStreet = scan(ES, scroll='1h', query=queryAllStreet,
                             index=address.INDEX)
        ADDR_UPDATE_CNT = Address.search()\
            .query("term", ao_level="7").count()

    print("ADDR_UPDATE_CNT: ", ADDR_UPDATE_CNT)

    # Rebuild the street_address_suggest index
    addrSearch = Address.search()
    homeSearch = House.search()
    print("address: ", Address.search().count())
    print("houses: ", House.search().count())
    houseList = []
    for address_hit in tqdm(scanResStreet, unit=' address', desc='indexed',
                            total=ADDR_UPDATE_CNT):
        # source = address_hit['_source']
        # Get the street
        street = Address.get(address_hit['_id'])
        try:
            # Find the city
            city = addrSearch.query("match", ao_guid=street.parent_guid).execute()[0]
            if not city.parent_guid:
                # For Moscow, St. Petersburg and Sevastopol the region equals the city.
                district = city
            else:
                # Find the region
                district = addrSearch.query("match", ao_guid=city.parent_guid).execute()[0]
        except Exception:
            print()
            print("Indexing error:")
            print("city:", city)
            print()
            print("city.parent_guid: " + str(city.parent_guid == False))
            print(address_hit['_id'])
            print(street.short_name + "." + street.off_name.lower().strip()
                  + ", " + city.short_name + "." + city.off_name.lower().strip()
                  + ", " + district.short_name.lower().strip() + "."
                  + district.off_name.lower().strip())
            print()
            continue
        else:
            houses_found = homeSearch.filter("term", ao_guid=street.ao_guid)
            for house in houses_found.scan():
                houseList.append(house)
            try:
                if street.postal_code:
                    postal_code = street.postal_code + ', '
                else:
                    postal_code = ''
                street.update(
                    street_type=street.short_name.strip(),
                    street=street.off_name.strip(),
                    settlement=city.off_name.strip(),
                    settlement_type=city.short_name.strip(),
                    district=district.off_name.strip(),
                    district_type=district.short_name.strip(),
                    street_address_suggest=district.off_name.lower().strip()
                    + " " + city.off_name.lower().strip()
                    + " " + street.off_name.lower().strip(),
                    full_address=postal_code + district.short_name + ' '
                    + district.off_name + ', ' + city.short_name + ' '
                    + city.off_name + ', ' + street.short_name + ' '
                    + street.off_name,
                    houses=houseList)
                houseList[:] = []
            except Exception:
                print(house)

    # The original printed "finish" after an unconditional return, which was
    # unreachable; print before returning instead.
    print("finish")
    return ADDR_UPDATE_CNT