def insert_document_to_index(documents, text_an, index, keep):
    """Prepare the Elasticsearch index ``index`` and bulk-load ``documents``.

    :param documents: bulk actions handed unchanged to ``bulk``.
    :param text_an: analyzer registered on the index.
    :param index: name of the index to (re)build.
    :param keep: when falsy, an existing index is deleted first.
    """
    es_client = Elasticsearch()
    target = Index(index, using=es_client)

    if target.exists() and not keep:
        print('Removing existing index...')
        target.delete()

    if not target.exists():
        print('Creating index')
        target.create()

    # The index is closed while the analyzer and mapping are changed.
    path_mapping = {
        'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }
    }
    target.close()
    target.analyzer(text_an)
    es_client.indices.put_mapping(
        doc_type='document',
        index=index,
        body=path_mapping)
    target.save()
    target.open()

    print("Index settings=", target.get_settings())
    print('Indexing ...')
    bulk(es_client, documents)
def test_complete(sch, database, elasticsearch):
    """A test of the complete functioning of the pipeline.

    Sample rows are inserted into a randomly named database table, the
    'Tour de France Data to Elasticsearch' pipeline is run as a test job, and
    the documents found in a randomly named Elasticsearch index are compared
    with the expected records. The index and the table are removed afterwards.
    """
    pipeline = sch.pipelines.get(name='Tour de France Data to Elasticsearch')

    SAMPLE_DATA = [dict(year=1903, rank=1, name='MAURICE GARIN', number=1, team='TDF 1903',
                        time='94h 33m 14s', hours=94, mins=33, secs=14),
                   dict(year=1903, rank=2, name='LUCIEN POTHIER', number=37, team='TDF 1903',
                        time='97h 32m 35s', hours=97, mins=32, secs=35),
                   dict(year=1903, rank=3, name='FERNAND AUGEREAU', number=39, team='TDF 1903',
                        time='99h 02m 38s', hours=99, mins=2, secs=38)]
    EXPECTED_RECORDS = [dict(year=1903, rank=1, firstName='Maurice', lastName='Garin', number=1,
                             team='TDF 1903', time='94h 33m 14s', hours=94, mins=33, secs=14),
                        dict(year=1903, rank=2, firstName='Lucien', lastName='Pothier', number=37,
                             team='TDF 1903', time='97h 32m 35s', hours=97, mins=32, secs=35),
                        dict(year=1903, rank=3, firstName='Fernand', lastName='Augereau', number=39,
                             team='TDF 1903', time='99h 02m 38s', hours=99, mins=2, secs=38)]

    table_name = get_random_string()
    index = get_random_string(string.ascii_lowercase)
    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column('year', sqlalchemy.Integer),
                             sqlalchemy.Column('rank', sqlalchemy.Integer),
                             sqlalchemy.Column('name', sqlalchemy.String(100)),
                             sqlalchemy.Column('number', sqlalchemy.Integer),
                             sqlalchemy.Column('team', sqlalchemy.String(100)),
                             sqlalchemy.Column('time', sqlalchemy.String(100)),
                             sqlalchemy.Column('hours', sqlalchemy.Integer),
                             sqlalchemy.Column('mins', sqlalchemy.Integer),
                             sqlalchemy.Column('secs', sqlalchemy.Integer))
    try:
        logger.info('Creating table (%s) in database ...', table_name)
        table.create(database.engine)

        logger.info('Inserting sample data ...')
        connection = database.engine.connect()
        try:
            connection.execute(table.insert(), SAMPLE_DATA)
        finally:
            # Release the connection so it cannot linger until the table drop.
            connection.close()

        runtime_parameters = dict(JDBC_CONNECTION_STRING=database.jdbc_connection_string,
                                  JDBC_USERNAME=database.username,
                                  JDBC_PASSWORD=database.password,
                                  ELASTICSEARCH_URI=f'{elasticsearch.hostname}:{elasticsearch.port}',
                                  ELASTICSEARCH_CREDENTIALS=f'{elasticsearch.username}:{elasticsearch.password}',
                                  ELASTICSEARCH_INDEX=index,
                                  TABLE_NAME_PATTERN=f'%{table_name}%')
        with sch.run_test_job(pipeline, runtime_parameters,
                              data_collector_labels=sch.data_collector_labels) as job:
            # Fixed wait; the records are read back after this pause.
            time.sleep(10)

            data_in_elasticsearch = [hit.to_dict()
                                     for hit in elasticsearch.search(index=index).sort('rank').execute()]
            assert EXPECTED_RECORDS == data_in_elasticsearch
    finally:
        # Keep the index *name* in `index` so the log message shows the name.
        es_index = Index(index, using=elasticsearch.client)
        if es_index.exists():
            logger.info('Deleting Elasticsearch index %s ...', index)
            es_index.delete()
        logger.info('Dropping table %s ...', table_name)
        table.drop(database.engine)
def _init_index(index_config, force):
    """Create (or reuse) the index described by ``index_config``.

    :param index_config: mapping with the keys ``name``, ``alias`` (alias
        names, or mappings with ``name`` and ``config``) and ``documents``
        (mappings whose ``class`` entry is a dotted path to a document class).
    :param force: when truthy, delete the index (ignoring 404) before creating.
    :returns: the ``Index`` object, reopened after all documents were
        initialised.
    :raises TransportError: if index creation fails for any reason other
        than the index already existing.
    """
    index = Index(index_config['name'])

    aliases = {}
    for alias_val in index_config['alias']:
        if isinstance(alias_val, basestring):
            # Bare alias name: no extra configuration.
            aliases[alias_val] = {}
        else:
            aliases[alias_val['name']] = alias_val['config']
    index.aliases(**aliases)

    if force:
        index.delete(ignore=404)

    try:
        index.create()
    except TransportError as err:
        # Elasticsearch reports an existing index as HTTP 400
        # (*_already_exists_exception), not 404. Anything else is a real
        # failure and must not be swallowed.
        already_exists = (err.status_code == 400
                          and 'already_exists' in str(err.error))
        if not already_exists:
            raise
        logger.debug('Index already exists, initializing document')

    # The index is closed while the document classes are initialised.
    index.close()
    for document_config in index_config['documents']:
        module_str, class_str = document_config['class'].rsplit('.', 1)
        module = import_module(module_str)
        cls = getattr(module, class_str)
        index.doc_type(cls)
        cls.init()
    index.open()

    return index
def setup_db():
    """Generator that creates an index per aggregate, yields, then drops them.

    Registers the test elements on a freshly initialised domain and makes sure
    an index exists for every registered aggregate. After the ``yield`` every
    index that still exists is deleted.
    """
    test_domain = initialize_domain()

    # Create all indexes
    from .elements import Person, Alien, User, ComplexUser, Provider

    for element in (Person, Alien, User, ComplexUser, Provider):
        test_domain.register(element)

    conn = test_domain.get_provider('default').get_connection()

    def _aggregate_indexes():
        # One Index object per registered aggregate, built lazily.
        for _, aggregate_record in test_domain.aggregates.items():
            yield Index(aggregate_record.cls.meta_.schema_name, using=conn)

    for index in _aggregate_indexes():
        if not index.exists():
            index.create()

    yield

    # Drop all indexes at the end of test suite
    for index in _aggregate_indexes():
        if index.exists():
            index.delete()
def user_index(app):
    """Initialize the `User` doc type.

    Creates a uniquely named index, swaps it in for ``auth_models.auth_index``
    while the generator is suspended, and afterwards restores the original
    and deletes the temporary index.
    """
    temp_index = Index(uuid4().hex)
    temp_index.create()
    app.cluster.health(wait_for_status='yellow')

    # monkey patch `auth_index`
    saved_auth_index = auth_models.auth_index
    auth_models.auth_index = temp_index

    User.init(index=temp_index._name)

    yield temp_index

    auth_models.auth_index = saved_auth_index

    # Remove all `User`s.
    #
    # [Don't use delete-by-query to clean out all or most documents in an
    # index. Rather create a new index...]
    # (https://www.elastic.co/guide/en/elasticsearch/plugins/2.2/plugins-delete-by-query.html)
    #
    # [It is no longer possible to delete the mapping for a type. Instead you
    # should delete the index and recreate it with the new mappings.]
    # (https://www.elastic.co/guide/en/elasticsearch/reference/2.2/indices-delete-mapping.html)
    temp_index.delete()
def _create_index(cls, index_name):
    """Drop and recreate ``index_name`` with the default settings.

    After the index is created, ``cls.init`` is called for it.

    :returns: the ``Index`` object that was created.
    """
    fresh_index = Index(index_name, using=CONNECTION_ALIAS)

    # Responses with status 400 or 404 from the delete call are ignored.
    fresh_index.delete(ignore=[400, 404])

    fresh_index.settings(index=DEFAULT_INDEX_SETTING)
    fresh_index.create()

    cls.init(index=index_name)
    return fresh_index
def create_index():
    """Recreate ``settings.INDEX`` with the Action, Contact and Run doc types.

    The index is deleted first (a 404 is ignored), the doc types are
    registered, the index is created and finally ``load_flows`` is called.
    """
    es_index = Index(settings.INDEX)
    es_index.delete(ignore=404)

    registered_types = (Action, Contact, Run)
    for doc_type in registered_types:
        es_index.doc_type(doc_type)

    es_index.create()
    load_flows()
def delete_failed_index(mdx, dic_pk):
    """Delete the index named after ``dic_pk``.

    Any exception raised while doing so is passed to
    ``write_exception_error`` together with ``mdx`` and ``dic_pk``.
    """
    try:
        print('try delete failed index', get_index_name_with_pk(dic_pk))
        Index(get_index_name_with_pk(dic_pk)).delete()
    except Exception as exc:
        write_exception_error(mdx, dic_pk, exc)
def rebuild_platforms(self):
    """Recreate the ``galaxy_platforms`` index and save one doc per platform.

    For every distinct active platform name a ``PlatformDoc`` is saved with
    its releases, the number of active valid roles for that platform, its
    search aliases and an autocomplete string.
    """
    platform_index = Index('galaxy_platforms')
    platform_index.doc_type(PlatformDoc)
    platform_index.delete(ignore=404)
    platform_index.create()

    active_platforms = Platform.objects.filter(active=True).distinct('name').all()
    for platform in active_platforms:
        # An empty alias list is stored as an empty string.
        alias_list = list(self.get_platform_search_terms(platform.name)) or ''

        release_rows = Platform.objects.filter(
            active=True, name=platform.name
        ).order_by('release').distinct('release').all()
        release_list = [row.release for row in release_rows]

        if platform.name == 'EL':
            search_name = 'Enterprise_Linux'
        else:
            search_name = platform.name

        role_count = Role.objects.filter(
            active=True, is_valid=True, platforms__name=platform.name
        ).order_by('namespace', 'name').distinct('namespace', 'name').count()

        autocomplete = "%s %s %s" % (
            search_name, ' '.join(release_list), ' '.join(alias_list))

        PlatformDoc(
            name=search_name,
            releases=release_list,
            roles=role_count,
            alias=alias_list,
            autocomplete=autocomplete,
        ).save()
def buildIndex():
    """Rebuild ``song_index`` from ``./data/data_v3.json`` using a bulk load.

    The JSON file is read as a mapping keyed by the strings "1".."N"; one
    bulk action is generated per key.
    """
    song_index = Index('song_index')
    if song_index.exists():
        # Overwrite any previous version
        song_index.delete()
    # Register the Song doc type on the index
    song_index.doc_type(Song)
    song_index.create()

    # Open the json song corpus
    with open('./data/data_v3.json') as data_file:
        songs = json.load(data_file)
        size = len(songs)

    copied_fields = (
        "title", "description", "type", "song_name", "album_name",
        "artist", "rank", "charts", "image_link",
    )

    def _action(mid):
        # Build the bulk action for the song stored under key str(mid).
        song = songs[str(mid)]
        action = {
            "_index": "song_index",
            "_type": "song",
            "_id": mid,
        }
        for field in copied_fields:
            action[field] = song[field]
        action["category"] = list2str(song['category'])
        return action

    # Action series for bulk loading
    actions = [_action(mid) for mid in range(1, size + 1)]
    helpers.bulk(es, actions)
def es_delete_cmd(index_name):
    """Delete a specified index

    Logs an error and returns if ``index_name`` is not among the names
    reported by ``get_indexes``; otherwise asks for confirmation on the
    console and deletes the index only when the answer is exactly ``yes``.

    :arg index_name: name of index to delete

    """
    known_names = [name for name, count in get_indexes()]

    if index_name not in known_names:
        log.error('Index "%s" is not a valid index.', index_name)
        if known_names:
            log.error('Valid indexes: %s', ', '.join(known_names))
        else:
            log.error('There are no valid indexes.')
        return

    answer = raw_input('Are you sure you want to delete "%s"? (yes/no) ' % index_name)
    if answer != 'yes':
        return

    log.info('Deleting index "%s"...', index_name)
    try:
        Index(name=index_name, using='default').delete()
    except NotFoundError:
        # A missing index is not treated as an error.
        pass
    log.info('Done!')
def create_delete_index(**kwargs):
    """(Re)create one document-evaluation index per criterion or class id.

    Required keyword arguments:

    * ``crit_or_class_ids`` -- ids to create indices for.
    * ``is_criterion`` -- selects the ``_m4a`` (truthy) or ``_m4a_class``
      (falsy) index-name suffix.
    * ``perform_actualize`` -- when falsy, an existing index is deleted first.
    * ``topic_modelling_name`` -- part of the index name.
    * ``scored_documents`` -- its ``shape[0]`` is passed to
      ``shards_mapping`` to choose the number of shards.
    """
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_EVAL
    from mainapp.documents import DocumentEval
    from util.util import shards_mapping
    from elasticsearch_dsl import Index

    crit_or_class_ids = kwargs['crit_or_class_ids']
    is_criterion = kwargs['is_criterion']
    perform_actualize = kwargs['perform_actualize']
    topic_modelling_name = kwargs['topic_modelling_name']
    scored_documents = kwargs['scored_documents']

    suffix = '_m4a' if is_criterion else '_m4a_class'
    for crit_id in crit_or_class_ids:
        index_name = f"{ES_INDEX_DOCUMENT_EVAL}_{topic_modelling_name}_{crit_id}{suffix}"

        if not perform_actualize:
            Index(index_name, using=ES_CLIENT).delete(ignore=404)

        if not ES_CLIENT.indices.exists(index_name):
            # Work on a copy: DocumentEval.Index.settings is shared class
            # state and must not keep this call's shard count.
            settings = dict(DocumentEval.Index.settings)
            settings['number_of_shards'] = shards_mapping(
                scored_documents.shape[0])
            ES_CLIENT.indices.create(
                index=index_name,
                body={
                    "settings": settings,
                    "mappings": DocumentEval.Index.mappings
                })
def buildIndex():
    """Rebuild ``sample_film_index`` from ``films_corpus.json`` in one bulk load.

    The JSON file is read as a mapping keyed by the strings "1".."N"; the
    ``runtime`` value of each entry is passed through ``get_runtime``.
    """
    film_index = Index('sample_film_index')
    if film_index.exists():
        # Overwrite any previous version
        film_index.delete()
    # Set doc_type to Movie
    film_index.doc_type(Movie)
    film_index.create()

    # Open the json film corpus
    with open('films_corpus.json') as data_file:
        movies = json.load(data_file)
        size = len(movies)

    def _action(mid):
        # Build the bulk action for the movie stored under key str(mid).
        movie = movies[str(mid)]
        return {
            "_index": "sample_film_index",
            "_type": "movie",
            "_id": mid,
            "title": movie['title'],
            "text": movie['text'],
            "starring": movie['starring'],
            "runtime": get_runtime(movie['runtime']),
            "language": movie['language'],
            "country": movie['country'],
            "director": movie['director'],
            "location": movie['location'],
            "time": movie['time'],
            "categories": movie['categories'],
        }

    # Action series for bulk loading
    actions = [_action(mid) for mid in range(1, size + 1)]
    helpers.bulk(es, actions)
def main(es_index_name, file_to_index):
    """
    Build an elasticsearch index over the Docterms data using bulk load

    :param es_index_name: name of the index to (re)create and fill.
    :param file_to_index: passed to ``docterms_document_stream`` as the
        source of the documents.
    """
    start_time = time.time()

    index = Index(es_index_name)
    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("[main]After index(es_index_name)")
    if index.exists():
        index.delete()  # Overwrite any previous version
    index.doc_type(DocTerms)
    print("[main]After index.doc_type.")
    index.create()
    print("[main]Created index.")

    stream = docterms_document_stream(file_to_index, es_index_name)
    print("[main]Calling bulk loader.")
    helpers.bulk(es, stream)

    es.indices.refresh(index=es_index_name)
    print("[main]Built index in %s seconds ===" % (time.time() - start_time))
def addIndex(gI):
    """Index the file ``output<gI>`` as a single document in its own index.

    The file is read as ISO-8859-1 text; the index (named like the file) is
    dropped if present, recreated with one shard and filled with one
    document holding the fields ``path`` and ``text``.

    :param gI: suffix appended to ``"output"`` to form both the file name
        and the index name.
    """
    f = "output" + str(gI)
    # Read the whole file in one go and make sure the handle is closed.
    with codecs.open(f, "r", encoding='iso-8859-1') as ftxt:
        text = ftxt.read()

    # Insert operation for a document with fields 'path' and 'text'
    ldocs = [{'_op_type': 'index', '_index': f, '_type': 'document',
              'path': f, 'text': text}]

    # Working with ElasticSearch
    client = Elasticsearch()
    ind = Index(f, using=client)
    try:
        # Drop index if it exists
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    # Bulk execution of elasticsearch operations (faster than executing all one by one)
    print('Indexing ...')
    bulk(client, ldocs)
def handle(self, *args, **kwargs):
    """Recreate the index of every entry in ``MODELS`` and re-index its rows.

    For each ``(name, model)`` pair the index of ``model.es_doc`` is deleted
    and created again; if the model has rows they are streamed to
    Elasticsearch while a progress bar is advanced once per yielded result.
    """
    es = connections.get_connection()

    for name, model in MODELS:
        print('Processing %s:' % name)

        document = model.es_doc
        index_name = document._doc_type.index

        index = Index(index_name)
        index.settings(**settings.ES_INDEXES_SETTINGS)
        index.doc_type(document)

        print('  - Deleting index.')
        index.delete(ignore=404)

        print('  - Creating index.')
        index.create()

        total = model.objects.count()
        if total == 0:
            print('  - No %s to index.' % name)
            continue

        progress_bar = tqdm(
            total=total,
            bar_format='  - Indexing: {n_fmt}/{total_fmt} [{elapsed} < {remaining}]',
            ncols=1,  # required to show the custom bar_format
        )
        payloads = (obj.get_es_data() for obj in model.objects.all().iterator())
        results = streaming_bulk(
            es,
            payloads,
            index=index_name,
            doc_type=document._doc_type.name,
        )
        for _ in results:
            progress_bar.update(1)
        progress_bar.close()
class BaseSearchTestCase(TestCase):
    """Test case base that rebuilds the configured search index per test."""

    def setUp(self):
        """Create the 'testing' connection and a fresh index with all doc types."""
        from django.conf import settings

        search_conf = settings.SEARCH
        default_conf = search_conf['default']

        connections.create_connection('testing', **default_conf['connections'])
        self.index = Index(default_conf['index'], using='testing')

        # This is needed for test_documents, but has side effects in all running tests
        for name, value in inspect.getmembers(documents):
            if name.startswith('_'):
                continue
            if not (inspect.isclass(value) and issubclass(value, DocType)):
                continue
            if name == DocType.__name__:
                continue
            # Remove assigned index
            value._doc_type.index = None
            # Associate docs with test index
            self.index.doc_type(value)

        if self.index.exists():
            self.index.delete(ignore=404)
        self.index.create()

        self.search = Search(index=default_conf['index'])

    def tearDown(self):
        """Delete the test index and empty the django_rq queue."""
        self.index.delete()
        django_rq.get_queue().empty()
def insert_documents_to_index(documents, an, index):
    """Recreate ``index`` with analyzer ``an`` and bulk-load ``documents``.

    :param documents: bulk actions handed unchanged to ``bulk``.
    :param an: analyzer registered on the index.
    :param index: name of the index; an existing one is deleted first.
    """
    es_client = Elasticsearch()

    # Start from a clean single-shard index.
    stale = Index(index, using=es_client)
    if stale.exists():
        stale.delete()
    stale.settings(number_of_shards=1)
    stale.create()

    # A fresh Index object is used for the analyzer/mapping changes, which
    # are applied while the index is closed.
    path_mapping = {
        'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }
    }
    target = Index(index, using=es_client)
    target.close()
    target.analyzer(an)
    es_client.indices.put_mapping(
        doc_type='document',
        index=index,
        body=path_mapping)
    target.save()
    target.open()

    print('Index settings=', target.get_settings())
    print('Indexing ...')
    bulk(es_client, documents)
def build_query_Index():
    """Recreate ``query_index`` for ``SearchQuery`` documents.

    An existing index is deleted before the new one is created, after which
    ``SearchQuery.init`` is called.
    """
    idx = Index('query_index')
    idx.document(SearchQuery)

    already_there = idx.exists()
    if already_there:
        idx.delete()

    idx.create()
    SearchQuery.init()
def buildIndex():
    """Rebuild ``test_rare_disease_index`` from ``disease_data.json``.

    The JSON file is read as a mapping keyed by the strings "1".."N"; one
    bulk action is generated per key.
    """
    Disease_index = Index('test_rare_disease_index')
    if Disease_index.exists():
        # Overwrite any previous version
        Disease_index.delete()
    # Set doc_type to Disease
    Disease_index.doc_type(Disease)
    Disease_index.create()

    # Open the json disease corpus
    with open('disease_data.json') as data_file:
        diseases = json.load(data_file)
        size = len(diseases)

    copied_fields = (
        "disease_type", "name", "introduction", "symptoms", "causes",
        "treatment", "diagnosis", "affected_populations",
    )

    def _action(mid):
        # Build the bulk action for the disease stored under key str(mid).
        disease = diseases[str(mid)]
        action = {
            "_index": "test_rare_disease_index",
            "_type": "disease",
            "_id": mid,
        }
        for field in copied_fields:
            action[field] = disease[field]
        return action

    # Action series for bulk loading
    actions = [_action(mid) for mid in range(1, size + 1)]
    helpers.bulk(es, actions)
def setup_db():
    """Generator that creates an index per aggregate, yields, then drops them.

    Everything happens inside ``domain.domain_context()``: the test elements
    are registered, missing indexes are created, control is yielded, and
    afterwards every index that still exists is deleted.
    """
    domain = initialize_domain()

    with domain.domain_context():
        # Create all indexes

        # Local/Relative Imports
        from .elements import Alien, ComplexUser, Person, User

        for element in (Person, Alien, User, ComplexUser):
            domain.register(element)

        conn = domain.get_provider("default").get_connection()

        def _aggregate_indexes():
            # One Index object per registered aggregate, built lazily.
            for _, aggregate_record in domain.registry.aggregates.items():
                yield Index(aggregate_record.cls.meta_.schema_name, using=conn)

        for index in _aggregate_indexes():
            if not index.exists():
                index.create()

        yield

        # Drop all indexes at the end of test suite
        for index in _aggregate_indexes():
            if index.exists():
                index.delete()
def es_delete_cmd(index_name):
    """Delete a specified index

    Nothing is deleted unless ``index_name`` is one of the names reported by
    ``get_indexes`` and the console prompt is answered with exactly ``yes``.

    :arg index_name: name of index to delete

    """
    valid_names = [name for name, count in get_indexes()]
    is_valid = index_name in valid_names

    if not is_valid:
        log.error('Index "%s" is not a valid index.', index_name)
        if not valid_names:
            log.error('There are no valid indexes.')
        else:
            log.error('Valid indexes: %s', ', '.join(valid_names))
        return

    prompt = 'Are you sure you want to delete "%s"? (yes/no) ' % index_name
    if raw_input(prompt) != 'yes':
        return

    log.info('Deleting index "%s"...', index_name)
    doomed = Index(name=index_name, using='default')
    try:
        doomed.delete()
    except NotFoundError:
        # The index vanished in the meantime; nothing left to do.
        pass
    log.info('Done!')
def create_index(index_name):
    """
    Create a new index in Elasticsearch.
    The new index will be identified by the value of 'index_name'.
    Any existing index with that name is deleted first.

    index_name: name of index to insert documents.
        Valid arguments: projects, publications, appdata

    Raises ValueError (a subclass of Exception) for any other name.
    """
    # Index name -> attribute of `models` holding the document class. The
    # attribute is looked up lazily so only the requested one is touched.
    model_attrs = {
        'projects': 'Project',
        'publications': 'Publication',
        'appdata': 'appdata',
    }
    if index_name not in model_attrs:
        raise ValueError(f"'{index_name}' is not a valid index name.")

    model = getattr(models, model_attrs[index_name])

    # initialize index
    idx = Index(index_name, using=client)
    # register a document with the index
    idx.document(model)
    # delete the index, ignore if it doesn't exist
    idx.delete(ignore=404)
    # create the index in elasticsearch
    idx.create()
def build_index():
    """
    Main function of this module. Build the covid relation index.

    Recreates ``covid_relation_index``, reads ``data/relations.csv`` into a
    dictionary keyed by 1-based row number (as a string) and bulk-loads one
    document per row. Rows missing one of the required fields are skipped.

    :return: None
    """
    covid_index = Index('covid_relation_index')
    if covid_index.exists():
        # Overwrite any previous version
        covid_index.delete()
    covid_index.document(RelationDocument)
    covid_index.create()

    metadata = {}
    with open('data/relations.csv', newline='') as csvfile:
        rows = csv.reader(csvfile)
        header = next(rows)
        # Each CSV row becomes one dictionary entry keyed by its 1-based
        # position.
        for row_number, cols in enumerate(rows, start=1):
            record = {}
            metadata[str(row_number)] = record
            for key, col in zip(header, cols):
                if key != 'triple':
                    record[key] = col
                    continue
                # A 'triple' value is decomposed into its predicate and the
                # remaining arguments.
                predicate, *args = col[2:-2].replace('\'', '').split(',')
                record['predicate'] = predicate
                record['arguments'] = args

    def actions():
        for rel_id in range(1, len(metadata) + 1):
            entry_key = str(rel_id)
            try:
                yield {
                    '_index': 'covid_relation_index',
                    '_type': '_doc',
                    '_id': rel_id,
                    # The DOI allows us to link directly to the article's page where it's hosted.
                    'doi': metadata[entry_key]['doi'],
                    # The doc_id refers to the CORD-NER-corpus.json dataset. This field is unused in our web app.
                    'doc_id': metadata[entry_key]['doc_id'],
                    # Sent refers to the sentence from which the document was drawn.
                    'sent': metadata[entry_key]['sent'],
                    # Predicate refers to the predicate as explained in the RelationDocument class
                    'predicate': metadata[entry_key]['predicate'],
                    # Argument refers to the arguments as explained in the RelationDocument class
                    'arguments': metadata[entry_key]['arguments'],
                }
            except (ValueError, KeyError):
                continue

    helpers.bulk(es, actions())
def test_offset_upgrade(sdc_builder, sdc_executor, elasticsearch):
    """Ensure that when upgrading from older offset format (that can be generated by either SCH or by upgrading
    pre-multithreaded pipeline) we properly upgrade the offset and the pipeline will not re-read everything from the
    source.
    """
    es_index = get_random_string(string.ascii_letters, 10).lower()
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    builder = sdc_builder.get_pipeline_builder()
    es_origin = builder.add_stage('Elasticsearch', type='origin')
    es_origin.set_attributes(index=es_index, query="{'query': {'match_all': {}}}")
    trash = builder.add_stage('Trash')

    es_origin >> trash
    pipeline = builder.build().configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(pipeline)

    # We hard code offset to be pre-migration to multi-threaded origin and thus forcing the origin to upgrade it
    offset = {
        'offsets': {
            '$com.streamsets.datacollector.pollsource.offset$': None,
        },
        'version': 2
    }
    sdc_executor.api_client.update_pipeline_committed_offsets(pipeline.id, body=offset)

    try:
        # Put data to Elasticsearch
        elasticsearch.connect()
        doc_type = DocType(meta={'id': es_doc_id, 'index': es_index})
        doc_type.body = raw_str
        doc_type.save()  # save document to Elasticsearch
        index = Index(es_index)
        assert index.refresh()  # assert to refresh index, making all operations available for search

        # Run pipeline and assert
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        # no need to stop pipeline - as ES origin shuts off once data is read from Elasticsearch
        snapshot_data = snapshot[es_origin.instance_name].output[0].field
        # assert ES meta
        assert snapshot_data['_index'] == es_index and snapshot_data['_id'] == es_doc_id
        # assert ES data
        assert snapshot_data['_source']['body'] == raw_str

        # Now let's validate that the offset doesn't have the poll key any more
        offset = sdc_executor.api_client.get_pipeline_committed_offsets(pipeline.id).response.json()
        assert offset is not None
        assert '$com.streamsets.datacollector.pollsource.offset$' not in offset['offsets']
    finally:
        # Clean up test data in ES. A missing index (the test failed before
        # the document was saved) must not mask the original failure.
        idx = Index(es_index)
        idx.delete(ignore=404)
def blog_index():
    """Yield the name of a freshly created, uniquely named index.

    The index is deleted once the generator is resumed.
    """
    scratch = Index(uuid4().hex)
    scratch.create()

    yield scratch._name

    scratch.delete()
def handle(self, *args, **options):
    """Recreate the ``companies`` index and run ``populate_elasticsearch``.

    The index is deleted (a 404 is ignored), given the ``CompanyDocType``
    doc type and the ``english`` analyzer, and created again.
    """
    company_index = Index('companies')
    company_index.delete(ignore=404)

    company_index.doc_type(CompanyDocType)
    english = analyzer('english')
    company_index.analyzer(english)
    company_index.create()

    management.call_command('populate_elasticsearch')
def migrate():
    """Recreate the ``hiddenservices`` index with both doc types.

    The old index is deleted (a 404 is ignored); a new ``Index`` object is
    then configured with 8 shards and 1 replica and created.
    """
    # Drop whatever is there.
    Index('hiddenservices').delete(ignore=404)

    # Configure and create the replacement on a fresh Index object.
    hidden_services = Index('hiddenservices')
    for doc_type in (DomainDocType, PageDocType):
        hidden_services.doc_type(doc_type)
    hidden_services.settings(number_of_shards=8, number_of_replicas=1)
    hidden_services.create()
def set_up(name: str, class_name, create: bool = False):
    """Register mappings with index, optionally delete and create the index.

    :param name: index name.
    :param class_name: document class registered on the index.
    :param create: when true, delete the index (ignoring a 404) and create it.
    """
    target = Index(name)
    target.document(class_name)

    if not create:
        return

    target.delete(ignore=404)
    target.create()
def run(self, *args, **options):
    """Ask for confirmation, then delete ``self.index_name`` unless dry-running.

    The success message is printed in both cases.
    """
    question = u"Are you really sure you want to delete the index '{0}' ?".format(
        self.index_name)
    self.confirm(question)

    target = Index(self.index_name)
    if not self.dry_run:
        target.delete()

    self.print_success(u"Index {0} deleted.".format(self.index_name))
def test_elasticsearch_pipeline_errors(sdc_builder, sdc_executor, elasticsearch):
    """Test for a pipeline's error records being pumped to Elasticsearch. We do so by making a Dev Raw Data source
    target to Error stage which would send records to the pipeline configured Elasticsearch error records handling.
    We then assert the error records what we find in Elasticsearch. The pipeline would look like:

    Elasticsearch error pipeline:
        dev_raw_data_source >> error_target
    """
    # Test static
    es_index = get_random_string(string.ascii_letters, 10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    errstg = builder.add_error_stage('Write to Elasticsearch')
    errstg.set_attributes(document_id=es_doc_id, index=es_index, mapping=es_mapping)
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='TEXT',
                                                                                  stop_after_first_batch=True,
                                                                                  raw_data=raw_str)
    error_target = builder.add_stage('To Error')

    dev_raw_data_source >> error_target
    es_error_pipeline = builder.build(title='ES error pipeline').configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(es_error_pipeline)

    try:
        elasticsearch.connect()

        # Make sure that the index exists properly before running the test
        index = Index(es_index)
        index.create()
        assert index.refresh()

        # Run pipeline and read from Elasticsearch to assert
        sdc_executor.start_pipeline(es_error_pipeline).wait_for_finished()

        # Since we are upsert on the same index, map, doc - there should only be one document (index 0)
        es_search = ESSearch(index=es_index)
        es_response = _es_search_with_retry(es_search)
        es_meta = es_response[0].meta
        # assert meta ingest
        assert es_meta['index'] == es_index and es_meta['doc_type'] == es_mapping and es_meta['id'] == es_doc_id
        # assert data ingest
        assert raw_str == es_response[0].text
    finally:
        # Clean up test data in ES. A missing index (index creation itself
        # failed) must not mask the original failure.
        idx = Index(es_index)
        idx.delete(ignore=404)
def test_delete(write_client):
    """``Index.delete`` removes an index created directly through the client."""
    index_settings = {'number_of_replicas': 0, 'number_of_shards': 1}
    write_client.indices.create(
        index='test-index',
        body={'settings': index_settings}
    )

    Index('test-index', using=write_client).delete()

    still_there = write_client.indices.exists(index='test-index')
    assert not still_there
def drop_index(silent=True):
    """Remove the ElasticSearch index.

    :param silent: when true (the default) any exception raised by the
        delete call is suppressed; otherwise it propagates to the caller.
    """
    es_index = Index(elasticsearch_config['index'])

    if not silent:
        es_index.delete()
        return

    try:
        es_index.delete()
    except Exception:
        # Best-effort removal was requested.
        pass
def run(self, *args, **options):
    """Confirm with the user and delete ``self.index_name`` (skipped on dry run).

    The success message is printed whether or not the delete was executed.
    """
    self.confirm(
        u"Are you really sure you want to delete the index '{0}' ?"
        .format(self.index_name)
    )

    doomed = Index(self.index_name)
    dry_run = self.dry_run
    if not dry_run:
        doomed.delete()

    message = u"Index {0} deleted.".format(self.index_name)
    self.print_success(message)
def recreate_index(self):
    """
    Delete the index named by ``self.index`` (a missing index is ignored)
    and then call ``ESSubmission.init()``.
    """
    Index(self.index).delete(ignore=404)
    ESSubmission.init()
def initialize_index(self, delete_if_exists=False):
    """
    Initialize index with mapping in ElasticSearch

    :param delete_if_exists: delete index, if exists
    :return: None
    """

    def update_index_settings():
        """
        Function updates settings for slovenian lemmatization of words.
        As far as we know, elasticsearch-dsl library does not support
        custom filter settings.

        The index is closed while the analysis settings are written and is
        reopened even if writing them fails.

        :return: None
        """
        analysis_settings = {
            "analysis": {
                "filter": {
                    "lemmagen_filter_sl": {
                        "type": "lemmagen",
                        "lexicon": "sl"
                    }
                },
                "analyzer": {
                    "lemmagen_sl": {
                        "type": "custom",
                        "tokenizer": "uax_url_email",
                        "filter": [
                            "lemmagen_filter_sl",
                            "lowercase"
                        ]
                    }
                }
            }
        }
        self.client.cluster.health(index=self.index_name,
                                   wait_for_status='green',
                                   request_timeout=2)
        self.client.indices.close(index=self.index_name)
        try:
            self.client.indices.put_settings(json.dumps(analysis_settings),
                                             index=self.index_name)
        finally:
            # Never leave the index closed behind us.
            self.client.indices.open(index=self.index_name)

    index = Index(self.index_name, using=self.client)
    if delete_if_exists and index.exists():
        index.delete()

    index.settings(
        # use higher number in production
        number_of_replicas=0
    )

    # register models
    index.doc_type(Document)
    index.create()
    update_index_settings()  # set lemmatizer
def test_create_index_manually(self):
    """``create_index <name>`` reports and actually creates the named index."""
    index_name = 'test_manually_created_index'
    captured = io.StringIO()

    call_command('create_index', index_name, stdout=captured)

    expected_message = "Created search index '{}'".format(index_name)
    self.assertIn(expected_message, captured.getvalue())

    created = Index(index_name)
    self.assertTrue(created.exists())

    # Clean up and make sure the index is really gone.
    created.delete()
    self.assertFalse(created.exists())
def create_search_index(index_name, doc_types=None, connection='default', delete_if_exists=False):
    """Return an ``Index`` for ``index_name``, creating it when missing.

    :param index_name: name of the index.
    :param doc_types: optional iterable of document classes, or of strings
        that are resolved through ``get_document_class``.
    :param connection: connection passed to ``Index`` as ``using``.
    :param delete_if_exists: delete the index first (a 404 is ignored).
    :returns: the ``Index`` object.
    """
    search_index = Index(index_name, using=connection)

    if delete_if_exists:
        search_index.delete(ignore=404)

    for entry in doc_types or ():
        resolved = get_document_class(entry) if isinstance(entry, str) else entry
        search_index.doc_type(resolved)

    if not search_index.exists():
        search_index.create()

    return search_index
def test_create_index_usings_settings(self):
    """``create_index`` without arguments creates the index from settings."""
    captured = io.StringIO()
    call_command('create_index', stdout=captured)

    configured_name = self.settings['default']['index']

    self.assertIn("Creating search indices from settings", captured.getvalue())
    self.assertIn("Created search index '{}'".format(configured_name), captured.getvalue())

    created = Index(configured_name)
    self.assertTrue(created.exists())

    # Clean up and make sure the index is really gone.
    created.delete()
    self.assertFalse(created.exists())
def create_indices(endpoint):
    """
    Creates constituent and address indices in PIC

    Opens the default connection to ``endpoint`` and recreates the ``pic``
    index with the ``Constituent`` and ``Address`` doc types.
    """
    connections.connections.create_connection(
        hosts=[endpoint],
        timeout=360,
        max_retries=10,
        retry_on_timeout=True,
    )

    pic_index = Index('pic')
    for doc_type in (Constituent, Address):
        pic_index.doc_type(doc_type)

    # Start from scratch; a missing index is fine.
    pic_index.delete(ignore=404)

    shard_settings = {'number_of_shards': 5, 'number_of_replicas': 2}
    pic_index.settings(**shard_settings)
    pic_index.create()
def recreate_index():
    """Delete index if it's there and creates a new one"""
    fresh = Index(name=get_index_name(), using='default')

    # Register every known doc type so the mappings go in with the create.
    for _name, doc_type in get_doctypes().items():
        fresh.doc_type(doc_type)

    # Delete the index if it exists.
    try:
        fresh.delete()
    except NotFoundError:
        pass

    # Note: There should be no mapping-conflict race here since the
    # index doesn't exist. Live indexing should just fail.

    # Create the index with the mappings all at once.
    fresh.create()
def build_search_index(cls, reset=True):
    """Add the objects with ids 1..``cls.count()`` to the search index.

    :param reset: when true, the doc type's index is deleted (a 404 is
        ignored) and re-initialised before indexing.

    Objects are indexed on a pool of 10 worker threads; a failure for an
    individual id is printed and does not stop the others.
    """
    if reset:
        ES_Index(cls._es_doctype._doc_type.index).delete(ignore=404)
        cls._es_doctype.init()

    def add_to_index(id_, db, app):
        # Each worker needs its own application context.
        with app.app_context():
            obj = db.session.query(cls).get(id_)
            obj.add_to_search_index()

    app = db.get_app()

    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_id = {
            executor.submit(add_to_index, id_, db, app): id_
            for id_ in xrange(1, cls.count() + 1)
        }
        for future in futures.as_completed(future_to_id):
            failed_id = future_to_id[future]
            error = future.exception()
            if error is not None:
                print('%r generated an exception: %s' % (
                    failed_id, error))
class SearchByFieldTestCase(TestCase):
    """Tests for field filters, date filters and paging of ``/cwapi/search/``.

    ``setUp`` stores 20 documents titled "0".."19" whose content is
    'foo bar baz Foo' and whose dates cycle through 2017-01-01..2017-01-05.
    """

    @staticmethod
    def _query_args(**filters):
        """Return query args for 2017-01-01..2017-01-30 plus ``filters``."""
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
        }
        query_args.update(filters)
        return query_args

    def _search(self, query_args):
        """GET the search endpoint and return the decoded JSON body."""
        response = self.client.get('/cwapi/search/', query_args)
        return response.json()

    def setUp(self):
        self.es_conn = connections.get_connection()
        self.test_crecs = [
            CRECDoc(
                title=str(i),
                content='foo bar baz Foo',
                date_issued=datetime(2017, 1, i % 5 + 1)
            )
            for i in range(20)
        ]
        self.index = Index(settings.ES_CW_INDEX)
        CRECDoc.init()
        for c in self.test_crecs:
            c.save(refresh=True)
        self.client = Client()

    def tearDown(self):
        self.index.delete()

    def test_search_by_title(self):
        c = CRECDoc(
            title='foo',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c.save(refresh=True)
        results = self._search(self._query_args(title='foo'))['data']
        self.assertEqual(1, len(results))
        self.assertEqual('foo', results[0]['title'])
        self.assertEqual('blah', results[0]['content'])

    def test_search_by_content(self):
        c = CRECDoc(
            title='foo',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c.save(refresh=True)
        results = self._search(self._query_args(content='blah'))['data']
        self.assertEqual(1, len(results))
        self.assertEqual('foo', results[0]['title'])
        self.assertEqual('blah', results[0]['content'])

    def test_date_filter(self):
        c = CRECDoc(
            title='should be in results',
            content='blah',
            date_issued=datetime(2017, 1, 1)
        )
        c2 = CRECDoc(
            title='should NOT be in results',
            content='blah',
            date_issued=datetime(2016, 1, 1)
        )
        c.save(refresh=True)
        c2.save(refresh=True)
        results = self._search(self._query_args(content='blah'))['data']
        self.assertEqual(1, len(results))
        self.assertEqual('should be in results', results[0]['title'])

    def test_multi_field(self):
        c = CRECDoc(
            title='foo',
            content='bar',
            date_issued=datetime(2017, 1, 1)
        )
        c2 = CRECDoc(
            title='foo',
            content='baz',
            date_issued=datetime(2016, 1, 1)
        )
        c.save(refresh=True)
        c2.save(refresh=True)
        results = self._search(self._query_args(content='bar', title='foo'))['data']
        self.assertEqual(1, len(results))

    def test_pagination(self):
        query_args = self._query_args(content='foo')

        # First page.
        response_content = self._search(query_args)
        max_score1 = max([d['score'] for d in response_content['data']])
        self.assertIsNotNone(max_score1)
        self.assertEqual(10, len(response_content['data']))

        # Second page: its best score must not exceed the first page's.
        query_args['offset'] = 10
        response_content = self._search(query_args)
        max_score2 = max([d['score'] for d in response_content['data']])
        self.assertIsNotNone(max_score2)
        self.assertTrue(max_score1 >= max_score2)
        self.assertEqual(10, len(response_content['data']))
#!/usr/bin/env python
"""Delete the daily ``logstash-YYYY.MM.DD`` indices of 3 and 4 days ago."""
from datetime import date, timedelta

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, DocType, String

# elasticsearch
client = Elasticsearch(['192.168.33.108:9200', '192.168.33.109:9200'])

# date
for x in (3, 4):
    ddate = date.today() - timedelta(days=x)
    ddate_str = ddate.strftime('%Y.%m.%d')
    idx = Index("logstash-" + ddate_str, using=client)
    # A missing index (404) is ignored; the message is printed regardless.
    idx.delete(ignore=404)
    print("%s is deleted" % ddate_str)
class TestMixins(BaseTestCase):
    """Tests for the Elasticsearch indexing mixin, exercised through Token and Person.

    Every test runs against a freshly created index for Token's doc type,
    which is deleted again in tearDown.
    """

    def setUp(self):
        """Create the index backing Token's doc type and refresh it."""
        super(TestMixins, self).setUp()
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        """Delete the index created in setUp."""
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        """A bare mixin instance reports itself as indexable."""
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_is_index_update_needed(self):
        """A bare mixin instance reports that an index update is needed."""
        self.assertTrue(ESIndexableMixin().is_index_update_needed())

    def test_get_indexable_queryset(self):
        """The indexable queryset issues the same SQL as Token.objects.all()."""
        self.assertEqual(
            str(Token.get_indexable_queryset().query),
            str(Token.objects.all().query)
        )

    def test_get_es_doc(self):
        """get_es_doc() is None before the model is saved and set afterwards."""
        token = Token(name="token")
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_auto_doc_type_mapping(self):
        """The generated doc mapping mirrors the model fields and full_name."""
        person = Person(first_name="Simion", last_name="Baws")
        person.save()
        doc_type = person.get_es_doc_mapping()
        self.assertEqual(doc_type.first_name, person.first_name)
        self.assertEqual(doc_type.last_name, person.last_name)
        self.assertEqual(
            doc_type.full_name,
            u"{0} {1}".format(person.first_name, person.last_name)
        )

    def test_es_index(self):
        """es_index() indexes on demand and honours the fail_silently option."""
        # Asynchronous call: creating a 'not_indexable' token leaves no
        # document behind, an explicit es_index() creates one.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)

        # Synchronous call. The keyword is passed through dict unpacking
        # because `async` is a reserved word from Python 3.7 on and
        # `es_index(async=False)` no longer parses; the callee still
        # receives the same keyword argument.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index(**{'async': False})
        self.assertDocExists(token)

        # Fail silently. The global option is restored in `finally` so a
        # failure here cannot leak the disabled state into other tests.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        try:
            token = Token.objects.create(name='raise_exception')
        finally:
            settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        token.es_index()
        self.assertDocDoesntExist(token)

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        try:
            with self.assertRaises(RuntimeError):
                token.es_index()
        finally:
            settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

    def test_es_delete(self):
        """es_delete() removes the document and honours fail_silently."""
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)

        # Synchronous call (see test_es_index for why the keyword is
        # spelled through dict unpacking).
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(**{'async': False})
        self.assertDocDoesntExist(Token, token.pk)

        # Fail silently if document doesn't exist.
        token.es_delete()

        from trampoline import get_trampoline_config
        trampoline_config = get_trampoline_config()

        # Fake delete to raise exception.
        backup_delete = trampoline_config.connection.delete

        def delete_raise_exception(*args, **kwargs):
            raise RuntimeError

        trampoline_config.connection.delete = delete_raise_exception
        try:
            # Fail silently
            token.es_delete()

            # Hard fail.
            settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
            try:
                with self.assertRaises(RuntimeError):
                    token.es_delete()
            finally:
                settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True
        finally:
            # Always put the real delete back, even if an assertion failed.
            trampoline_config.connection.delete = backup_delete

    def test_save(self):
        """Saving a model creates/updates its document unless indexing is disabled."""
        token = Token(name='token')

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        try:
            token.save()
        finally:
            settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocDoesntExist(token)

        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))

        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')

        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        """Deleting a model removes its document unless indexing is disabled."""
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        try:
            token.delete()
        finally:
            settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        # With indexing disabled the document survives the model delete.
        self.assertDocExists(Token, token_id)

        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
class ElasticSearchIndex:
    """Wrapper around one Elasticsearch index of answer pages.

    Each document carries a `page` plus `wiki_content` and/or `qb_content`
    text, and `search` runs a multi_match query over those two text fields.
    """

    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        """Bind to index `name` and build its doc type.

        Args:
            name: index name.
            similarity: forwarded to `create_doctype`.
            bm25_b: BM25 `b` parameter; `None` selects 0.75.
            bm25_k1: BM25 `k1` parameter; `None` selects 1.2.
        """
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        self.bm25_b = .75 if bm25_b is None else bm25_b
        self.bm25_k1 = 1.2 if bm25_k1 is None else bm25_k1

    def delete(self):
        """Delete the index; a missing index is logged instead of raised."""
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        """Return whether the index currently exists."""
        return self.ix.exists()

    def init(self):
        """Create the index, register the `qb_bm25` similarity and the doc mapping."""
        self.ix.create()
        # The index is closed while the similarity settings are written and
        # reopened afterwards.
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}}
        })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def _prepare_index(self, rebuild_index):
        """Optionally drop the index, then create it if it is missing.

        The index is dropped when `rebuild_index` is true or the
        QB_REBUILD_INDEX environment variable parses to a non-zero integer.

        Returns:
            True when the index was just created (and so needs documents),
            False when an existing index was left in place.
        """
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
            return False

        log.info(f'Index {self.name} does not exist')
        self.init()
        return True

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True, use_qb=True,
                         rebuild_index=False):
        """Index one document per page, holding its wiki text and question text.

        Nothing is indexed when the index already exists and no rebuild was
        requested.

        Args:
            documents: maps page to its question text.
            use_wiki: include the page's Wikipedia text when the page is
                found in the Wikipedia lookup, otherwise store ''.
            use_qb: include the question text, otherwise store ''.
            rebuild_index: drop and recreate the index first.
        """
        if not self._prepare_index(rebuild_index):
            return

        wiki_lookup = Wikipedia()
        log.info('Indexing questions and corresponding wikipedia pages as large docs...')
        for page in tqdm.tqdm(documents):
            if use_wiki and page in wiki_lookup:
                wiki_content = wiki_lookup[page].text
            else:
                wiki_content = ''
            qb_content = documents[page] if use_qb else ''

            answer = self.answer_doc(
                page=page,
                wiki_content=wiki_content,
                qb_content=qb_content
            )
            answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True,
                        rebuild_index=False):
        """Index many small documents per page.

        Nothing is indexed when the index already exists and no rebuild was
        requested.

        Args:
            pages: pages whose Wikipedia text is split into 200-token chunks,
                one document per chunk.
            documents: iterable of `(page, question_text)` pairs, one
                document per pair.
            use_wiki: index the Wikipedia chunks.
            use_qb: index the question documents.
            rebuild_index: drop and recreate the index first.
        """
        if not self._prepare_index(rebuild_index):
            return

        log.info('Indexing questions and corresponding pages as many docs...')
        if use_qb:
            log.info('Indexing questions...')
            for page, doc in tqdm.tqdm(documents):
                # index= is passed explicitly, matching build_large_docs.
                self.answer_doc(page=page, qb_content=doc).save(index=self.name)

        if use_wiki:
            log.info('Indexing wikipedia...')
            wiki_lookup = Wikipedia()
            for page in tqdm.tqdm(pages):
                if page not in wiki_lookup:
                    continue
                content = word_tokenize(wiki_lookup[page].text)
                for i in range(0, len(content), 200):
                    chunked_content = content[i:i + 200]
                    if chunked_content:
                        self.answer_doc(
                            page=page,
                            wiki_content=' '.join(chunked_content)
                        ).save(index=self.name)

    @staticmethod
    def _collect_guesses(results, query_length):
        """Turn hits into `(page, score / query_length)` pairs, one per page.

        Only the first hit of each page is kept. Hits are taken in the
        order given -- presumably best score first, Elasticsearch's default.
        """
        seen_pages = set()
        guesses = []
        for r in results:
            if r.page in seen_pages:
                continue
            # Record the page: without this the filter above never fires
            # and pages indexed as several documents come back repeatedly.
            seen_pages.add(r.page)
            guesses.append((r.page, r.meta.score / query_length))
        return guesses

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1, qb_boost=1):
        """Return up to `max_n_guesses` `(page, score)` guesses for `text`.

        Args:
            text: query text.
            max_n_guesses: number of hits requested from Elasticsearch; the
                result can be shorter once duplicate pages are dropped.
            normalize_score_by_length: divide each score by the number of
                whitespace-separated tokens in `text`.
            wiki_boost: boost for `wiki_content`; 1 means no boost suffix.
            qb_boost: boost for `qb_content`; 1 means no boost suffix.

        Raises:
            ValueError: if the index does not exist.
        """
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field]
        )
        results = s.execute()

        if normalize_score_by_length:
            # `or 1` keeps a blank query from dividing by zero.
            query_length = len(text.split()) or 1
        else:
            query_length = 1

        return self._collect_guesses(results, query_length)
def handle(self, *args, **options):
    """Delete the Elasticsearch index of every class returned by autodiscover()."""
    # Imported here rather than at module level, as in the original.
    from searching.utils import autodiscover

    for indexed_cls in autodiscover():
        # The index name comes from the Meta of the class's model index.
        index_name = indexed_cls.get_model_index().Meta.index
        Index(index_name).delete()
class CountTermsTestCase(TestCase):
    """Tests for per-day term counting, via the helper functions and the API.

    setUp indexes 20 documents with content 'foo bar baz Foo', spread
    evenly over 2017-01-01 .. 2017-01-05 (four documents per day). The
    tests expect a count of 8 per day for 'foo', i.e. both 'foo' and 'Foo'
    in each of the four documents are counted.
    """

    def setUp(self):
        """Index the 20 fixture documents and create the test client."""
        self.es_conn = connections.get_connection()
        self.test_crecs = [
            CRECDoc(
                title=str(i),
                content='foo bar baz Foo',
                # i % 5 + 1 cycles through days 1..5.
                date_issued=datetime(2017, 1, i % 5 + 1)
            )
            for i in range(20)
        ]
        self.index = Index(settings.ES_CW_INDEX)
        CRECDoc.init()
        for c in self.test_crecs:
            # refresh=True is passed so each document is searchable right away.
            c.save(refresh=True)
        self.client = Client()

    def tearDown(self):
        """Delete the index used by the fixtures."""
        self.index.delete()

    def test_num_docs_found(self):
        """A single-day range yields one bucket with a count of 8."""
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 1)
        results = get_term_count_in_doc(
            self.es_conn, 'foo', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        # assertEqual: the assertEquals alias is deprecated and gone in 3.12.
        self.assertEqual(len(buckets), 1)
        count = buckets[0].get('term_counts', {}).get('value')
        self.assertEqual(count, 8)

    def test_bucketing(self):
        """A range covering all fixtures yields five buckets of 8 each."""
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 30)
        results = get_term_count_in_doc(
            self.es_conn, 'foo', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        self.assertEqual(len(buckets), 5)
        for b in buckets:
            count = b.get('term_counts', {}).get('value')
            self.assertEqual(count, 8)

    def test_case_sensitivity(self):
        """Querying the upper-case term 'FOO' gives the same count as 'foo'."""
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 1)
        results = get_term_count_in_doc(
            self.es_conn, 'FOO', start_date, end_date
        )
        buckets = get_term_count_agg(results)
        self.assertIsNotNone(buckets)
        self.assertEqual(len(buckets), 1)
        count = buckets[0].get('term_counts', {}).get('value')
        self.assertEqual(count, 8)

    def test_api_start_end_specified(self):
        """The API returns one entry per day of an explicit start/end range."""
        start_date = datetime(2017, 1, 1)
        end_date = datetime(2017, 1, 31)
        query_args = {
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'term': 'foo',
        }
        response = self.client.get('/cwapi/term_counts_by_day/', query_args)
        self.assertEqual(200, response.status_code)
        response_content = response.json()
        self.assertEqual('success', response_content['status'])

        daily_counts = response_content['data']['daily_counts']
        self.assertEqual(31, len(daily_counts))
        total = 0
        for date_str, count in daily_counts.items():
            dt = datetime.strptime(date_str, '%Y-%m-%d')
            # Separate assertions so a failure names the offending bound.
            self.assertGreaterEqual(dt, start_date)
            self.assertLessEqual(dt, end_date)
            # Fixtures only exist for days 1..5; later days must be empty.
            if dt.day > 5:
                self.assertEqual(0, count)
            else:
                self.assertEqual(8, count)
            total += count
        self.assertEqual(40, total)

    @freeze_time('2017-01-31')
    def test_api_days_ago(self):
        """With time frozen at 2017-01-31, days_ago=30 spans January 1-31."""
        query_args = {
            'days_ago': 30,
            'term': 'foo',
        }
        response = self.client.get('/cwapi/term_counts_by_day/', query_args)
        self.assertEqual(200, response.status_code)
        response_content = response.json()
        self.assertEqual('success', response_content['status'])

        daily_counts = response_content['data']['daily_counts']
        self.assertEqual(31, len(daily_counts))
        for date_str in daily_counts:
            dt = datetime.strptime(date_str, '%Y-%m-%d')
            self.assertGreaterEqual(dt, datetime(2017, 1, 1))
            self.assertLessEqual(dt, datetime(2017, 1, 31))