def tearDown(self):
    """Tear down the ElasticSearch side of the fixture.

    Unlinks the ES cluster, stops XDCR replication, drops the "default"
    index when an ES host was configured, then delegates to the parent
    tearDown.  Log lines bracket the work so test logs show how long the
    ES teardown took.
    """
    self.log.warning("before tearDown es")
    self._unlink_es_cluster()
    self._stop_es_replication()
    # Fix: use `is not None` — identity comparison is the correct and
    # idiomatic None test (`!= None` invokes __ne__ and can misbehave).
    if self.es_host is not None:
        conn = ES(self.es_host + ":9200")
        conn.delete_index_if_exists("default")
    super(ElasticSearchSupport, self).tearDown()
    self.log.warning("after tearDown es")
def setUp(self):
    """Set up the ElasticSearch side of the fixture.

    Reads the ES host/port from test input params, optionally recreates
    the "default" index, then links the ES cluster and starts XDCR
    replication into it.
    """
    # Initialise ES attributes before the parent setUp runs, so teardown
    # is safe even if the parent raises part-way through.
    self.es_host = None
    self.es_cluster_name = None
    self._state = []
    super(ElasticSearchSupport, self).setUp()
    self.es_host = self.input.param("es_host", "127.0.0.1")
    self.es_port = self.input.param("es_port", 9091)
    conn = ES(self.es_host + ":9200")
    # Recreate the index only when cleanup is requested or on the first
    # test case of the run; later cases reuse the existing index.
    if not self.input.param("skip_cleanup", True) or self.case_number == 1:
        conn.delete_index_if_exists("default")
        conn.create_index("default")
        self.log.warning("waiting for ES index to be ready to use")
        # NOTE(review): fixed 30s wait after index creation — presumably to
        # let the new index reach a usable state; confirm whether a readiness
        # poll could replace the sleep.
        time.sleep(30)
    self._link_es_cluster()
    self._start_es_replication()
    self.log.warning("after setUp es")
def handle(self, *args, **kwargs):
    """Delete ElasticSearch indices and their aliases.

    With --all, every open and closed index is deleted.  Otherwise each
    positional argument is treated as a source name: its index, its
    alias, and any indices reachable through that alias are removed.
    """
    delete_all = kwargs.get('all')
    elastic = ES(settings.SEARCH_HOSTS)
    indices = []
    if delete_all:
        indices.extend(elastic.get_indices(True))
        indices.extend(elastic.get_closed_indices())
        for index in indices:
            elastic.delete_index_if_exists(index)
    else:
        for source_name in args:
            # Indices the alias points at that share the source's name.
            indices_aliased = [index for index in elastic.get_alias(source_name)
                               if index == source_name]
            elastic.delete_index_if_exists(source_name)
            if indices_aliased:
                elastic.delete_alias(source_name, indices_aliased)
                for index in indices_aliased:
                    elastic.delete_index_if_exists(index)
    # Fix: the original gated this message on `len(indices) and len(args)`,
    # which can never be true — the --all branch fills `indices` but takes no
    # positional args, while the per-source branch consumes `args` but never
    # appends to `indices`.  Report success when either path did work.
    if indices or args:
        # Fix: typo "indicies" in the user-facing message.
        self.stdout.write("Successfully deleted indices & aliases.\n")
    # Fix: close the connection exactly once, unconditionally (the original
    # closed it twice when the message condition held).
    elastic.connection.close()
class SampleMaker(object):
    """Build a pyes test index and fill it with randomly generated documents.

    All HTTP traffic is mirrored as curl commands into the log file given
    to the constructor, so the generated dataset can be replayed later.
    """

    def __init__(self, name):
        """Open the curl dump file *name*, connect to a local ES node and
        (re)create the test index with its mapping."""
        curl_dump = open(name, "wb")
        self.log = curl_dump
        self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0,
                       log_curl=True, dump_curl=curl_dump)
        self.index_name = "test-index"
        self.document_type = "test-type"
        # Start from a clean slate: drop any index left by a previous run.
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()

    def init_default_index(self):
        """Create the index with an explicit mapping for every sample field."""
        from pyes.helpers import SettingsBuilder
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField

        builder = SettingsBuilder()
        document = DocumentObjectField(name=self.document_type)
        # Scalar fields of the sample document, registered in one pass.
        for field in (
            StringField(name="description", store=True,
                        term_vector="with_positions_offsets", index="analyzed"),
            StringField(name="name", store=True,
                        term_vector="with_positions_offsets", index="analyzed"),
            StringField(name="tag", store=True, index="not_analyzed"),
            IntegerField(name="age", store=True),
            FloatField(name="price"),
            DateField(name="date", store=True),
            BooleanField(name="in_stock", store=True, index="not_analyzed"),
            GeoPointField(name="position"),
        ):
            document.add_property(field)
        # "metadata" is a list of nested {name, value, num} objects.
        metadata = NestedObject(name="metadata")
        metadata.add_property(StringField(name="name", store=True))
        metadata.add_property(StringField(name="value", store=True))
        metadata.add_property(IntegerField(name="num", store=True))
        document.add_property(metadata)
        builder.add_mapping(document)
        self.conn.ensure_index(self.index_name, builder)

    def generate_datafile(self, number_items=1000):
        """Generate a dataset with number_items elements."""
        names = get_names()
        totalnames = len(names)
        # init random seeder
        random.seed()
        for item_no in xrange(number_items):
            # Build the document field by field, in the same order as the
            # mapping above; every value is drawn at random.
            doc = {}
            doc["name"] = names[random.randint(0, totalnames - 1)]
            doc["age"] = random.randint(1, 100)
            doc["price"] = random.random() * 100.0
            doc["tag"] = [words(1, False) for r in xrange(random.randint(1, 5))]
            doc["in_stock"] = random.choice([True, False])
            doc["date"] = datetime.now() + timedelta(
                days=random.choice([1, -1]) * random.randint(0, 1000))
            doc["position"] = {
                "lat": random.choice([1, -1]) * random.random() * 90.0,
                "lon": random.choice([1, -1]) * random.random() * 180.0,
            }
            doc["description"] = words(random.randint(1, 100), False)
            doc["metadata"] = [
                {
                    "name": names[random.randint(0, totalnames - 1)],
                    "value": str(random.randint(1, 5)),
                    "num": random.randint(1, 50),
                }
                for r in xrange(random.randint(1, 5))
            ]
            # IDs are 1-based strings.
            self.conn.index(doc, self.index_name, self.document_type,
                            id=str(item_no + 1))

    def close(self):
        """Flush the index and close the curl dump file."""
        self.conn.flush(self.index_name)
        self.log.close()
class ProcessSpiderData(Task):
    """Celery task: post-process one spider's scraped items into a fresh
    ElasticSearch index, swap the alias to it, and store a compressed
    backup of the processed data."""

    def run(self, spider_name):
        """Process the Data row named *spider_name* end to end.

        Extracts article text via the Java bridge, geocodes cities,
        bulk-indexes items into a new timestamped index, repoints the
        alias, deletes the old indices, and saves a bz2/hex backup.
        Always returns True.
        """
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source
        if spider and len(source):
            backup_created_date = spider.created_date

            # New index name is unique per run (name + unix timestamp), so
            # indexing happens off to the side of the live alias.
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias: atomically repoint the public name to the new
            # index, then drop every index the alias used to cover.
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()
            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )
            # get_or_create returns (instance, created); the backup payload
            # is JSON, bz2-compressed, then hex-encoded for storage.
            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))
            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        """Replace raw HTML content with extracted article sentences (via
        the jpype Java bridge); empty content is left untouched."""
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        """Geocode item['city'] with GeoNames and attach a pin location.

        NOTE(review): the bare except deliberately makes geocoding
        best-effort (network/service errors just skip the pin), but it
        also hides programming errors — consider narrowing it.
        """
        if not item.has_key('city'):
            return item
        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)
            if places:
                # geopy may return either a list of results or a single one.
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {
                        'location': {'lat': lat, 'lon': lon}
                    }
        except:
            pass
        return item

    def _create_index(self, index, item):
        """Queue *item* for bulk indexing under its own id, unless a document
        with that id already exists in *index* (EAFP via the get call)."""
        id = item['id']
        del item['id']
        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        """Sync the City table with the cities seen in this run.

        Normalises each city name, links it to *spider_name*, and removes
        stale links: cities referenced only by this spider are deleted,
        cities shared with other spiders merely lose this spider's entry.
        """
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        # Split existing cities by whether this spider is their only index.
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices and len(city.indices) > 1
        ]

        for city in cities:
            # Normalise: lowercase, collapse whitespace, strip braces and
            # stray characters word-by-word.
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)
            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])
            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))
            city.save()

            # Still-present cities are not stale; drop them from both lists.
            if city.name in cities_old_single:
                cities_old_single.remove(city.name)
            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()
        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
class SampleMaker(object):
    """Generate random sample documents into a local ElasticSearch index,
    dumping every request as a curl command to a log file."""

    def __init__(self, name):
        """Connect to 127.0.0.1:9200, open *name* as the curl dump target,
        and rebuild the test index from scratch."""
        dump_file = open(name, "wb")
        self.log = dump_file
        self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0,
                       log_curl=True, dump_curl=dump_file)
        self.index_name = "test-index"
        self.document_type = "test-type"
        # Remove any stale index before recreating the mapping.
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()

    def init_default_index(self):
        """Define the document mapping and ensure the index exists."""
        from pyes.helpers import SettingsBuilder
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField

        index_settings = SettingsBuilder()
        mapping = DocumentObjectField(name=self.document_type)
        # Analyzed text fields keep term vectors for highlighting.
        mapping.add_property(StringField(
            name="description", store=True,
            term_vector="with_positions_offsets", index="analyzed"))
        mapping.add_property(StringField(
            name="name", store=True,
            term_vector="with_positions_offsets", index="analyzed"))
        # Tags are exact-match only, hence not_analyzed.
        mapping.add_property(StringField(name="tag", store=True,
                                         index="not_analyzed"))
        mapping.add_property(IntegerField(name="age", store=True))
        mapping.add_property(FloatField(name="price"))
        mapping.add_property(DateField(name="date", store=True))
        mapping.add_property(BooleanField(name="in_stock", store=True,
                                          index="not_analyzed"))
        mapping.add_property(GeoPointField(name="position"))
        # Nested metadata objects: {name, value, num}.
        meta = NestedObject(name="metadata")
        meta.add_property(StringField(name="name", store=True))
        meta.add_property(StringField(name="value", store=True))
        meta.add_property(IntegerField(name="num", store=True))
        mapping.add_property(meta)
        index_settings.add_mapping(mapping)
        self.conn.ensure_index(self.index_name, index_settings)

    def generate_datafile(self, number_items=1000):
        """Generate a dataset with number_items elements."""
        names = get_names()
        totalnames = len(names)
        # init random seeder
        random.seed()
        for seq in xrange(number_items):
            # dict(...) evaluates keyword values left to right, preserving
            # the order in which random numbers are drawn.
            sample = dict(
                name=names[random.randint(0, totalnames - 1)],
                age=random.randint(1, 100),
                price=random.random() * 100.0,
                tag=[words(1, False) for r in xrange(random.randint(1, 5))],
                in_stock=random.choice([True, False]),
                date=datetime.now() + timedelta(
                    days=random.choice([1, -1]) * random.randint(0, 1000)),
                position={
                    "lat": random.choice([1, -1]) * random.random() * 90.0,
                    "lon": random.choice([1, -1]) * random.random() * 180.0,
                },
                description=words(random.randint(1, 100), False),
                metadata=[
                    {
                        "name": names[random.randint(0, totalnames - 1)],
                        "value": str(random.randint(1, 5)),
                        "num": random.randint(1, 50),
                    }
                    for r in xrange(random.randint(1, 5))
                ],
            )
            self.conn.index(sample, self.index_name, self.document_type,
                            id=str(seq + 1))

    def close(self):
        """Flush pending writes and close the curl dump file."""
        self.conn.flush(self.index_name)
        self.log.close()