def job_redirect(request, slug, source, job_id):
    """Redirect to a job's details URL, or render the redirect page.

    For a GET request carrying a ``redirect`` query parameter, the job
    document is fetched from the search backend and the client is redirected
    to the ``details_url`` stored in the document's ``_source``. Any other
    request renders ``pages/redirect.html``.

    Args:
        request: The incoming HTTP request.
        slug: Not used by this view.
        source: Name of the index the job document is read from.
        job_id: Identifier of the job document.

    Raises:
        Http404: If the search backend reports the document as not found.
    """
    if request.method == 'GET' and 'redirect' in request.GET:
        elastic = ES(settings.SEARCH_HOSTS)
        try:
            data = elastic.get(source, 'job', job_id)
        except NotFoundException:
            raise Http404
        finally:
            # Release the connection on the not-found path as well, not only
            # after a successful lookup.
            elastic.connection.close()
        return HttpResponseRedirect(data['_source']['details_url'])
    return direct_to_template(request, 'pages/redirect.html')
# NOTE(review): job_redirect appears to be defined twice in this file with
# the same body; confirm which definition is meant to be kept.
def job_redirect(request, slug, source, job_id):
    """Redirect to a job's details URL, or render the redirect page.

    For a GET request carrying a ``redirect`` query parameter, the job
    document is fetched from the search backend and the client is redirected
    to the ``details_url`` stored in the document's ``_source``. Any other
    request renders ``pages/redirect.html``.

    Args:
        request: The incoming HTTP request.
        slug: Not used by this view.
        source: Name of the index the job document is read from.
        job_id: Identifier of the job document.

    Raises:
        Http404: If the search backend reports the document as not found.
    """
    if request.method == "GET" and "redirect" in request.GET:
        elastic = ES(settings.SEARCH_HOSTS)
        try:
            data = elastic.get(source, "job", job_id)
        except NotFoundException:
            raise Http404
        finally:
            # Release the connection on the not-found path as well, not only
            # after a successful lookup.
            elastic.connection.close()
        return HttpResponseRedirect(data["_source"]["details_url"])
    return direct_to_template(request, "pages/redirect.html")
class KVStore(KVStoreBase):
    """Key-value store that keeps its entries as ElasticSearch documents.

    Each entry is a document ``{"value": <value>}`` whose id is the key. The
    index and document type come from
    ``settings.THUMBNAIL_ELASTIC_SEARCH_INDEX`` and
    ``settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base store and open the ElasticSearch connection."""
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        """Return the stored value for ``key``, or None if it cannot be read."""
        try:
            value = self.connection.get(
                settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                key)
            return value['_source']['value']
        except Exception:
            # Lookups are best-effort: a missing document or a backend error
            # is reported as "no value". KeyboardInterrupt/SystemExit are
            # deliberately allowed to propagate.
            return None

    def _set_raw(self, key, value):
        """Store ``value`` under ``key`` and return the backend's 'ok' flag."""
        ret = self.connection.index(
            {"value": value},
            settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
            settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
            key)
        return ret['ok']

    def _delete_raw(self, *keys):
        """Delete every key in ``keys``.

        Returns:
            A list with one entry per key, in order: the backend's 'ok' flag,
            or False when the delete raised.
        """
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(
                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                    key)
                rets.append(ret['ok'])
            except Exception:
                # One failed delete must not abort the remaining ones.
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        """Return the ids of stored documents whose ``_id`` starts with ``prefix``.

        The search is issued with ``size=1000`` and ``start=0``, so a single
        call asks the backend for at most 1000 hits.
        """
        search = Search(query=PrefixQuery("_id", prefix), size=1000, start=0,
                        fields=[])
        results = self.connection.search(
            search,
            indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,],
            doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,])
        return [hit['_id'] for hit in results['hits']['hits']]
class ProcessSpiderData(Task):
    """Task that rebuilds the search index for one spider's scraped jobs."""

    def run(self, spider_name):
        """Re-index the jobs stored for ``spider_name`` and back them up.

        Steps, in order:
          1. Create a fresh index named ``<spider name>_<unix time>`` and put
             the ``job`` mapping on it.
          2. Clean and geocode every source item, then queue it for bulk
             indexing.
          3. Flush the bulk queue, point the spider's alias at the new index,
             delete the indices the alias pointed to before, and optimize.
          4. Update the City table, clear the cache and store a compressed
             backup of the processed items.

        The search connection and the Java extractor are released even when
        a step raises.

        Args:
            spider_name: ``name`` of the ``Data`` row to process.

        Returns:
            True once processing has finished.
        """
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        try:
            spider = Data.objects.get(name=spider_name)
            source = spider.source

            if spider and len(source):
                backup_created_date = spider.created_date
                index_new = '%s_%d' % (spider.name, int(time.time()))

                # create new index (not connected to alias)
                self.elastic.create_index(index_new)
                self.elastic.put_mapping(
                    'job', {'job': {'properties': mapping}}, index_new)

                for item in source:
                    item = self._process_content(item)
                    item = self._get_location(item)

                    if 'city' in item:
                        cities.append(item['city'])

                    self._create_index(index_new, item)
                    backup_source.append(item)

                # save new index (in bulk)
                self.elastic.force_bulk()

                # create alias
                indices_old = self.elastic.get_alias(spider.name)
                self.elastic.set_alias(spider.name, [index_new])

                # delete all indices the alias pointed to before the switch
                for index in indices_old:
                    self.elastic.delete_index_if_exists(index)

                # optimize
                self.elastic.optimize(index_new, refresh=True)

            # save backup (currently processed data)
            if len(backup_source) and backup_created_date:
                self._process_cities(set(cities), spider_name)
                cache.clear()

                obj = DataBackup.objects.get_or_create(
                    name=spider_name,
                    created_date=backup_created_date
                )
                obj[0].source = binascii.hexlify(bz2.compress(
                    JSONEncoder().encode(backup_source)
                ))
                obj[0].save()
        finally:
            # force java & ES garbage collection, also when a step failed
            self.elastic.connection.close()
            del self.extractor
            del java

        return True

    def _process_content(self, item):
        """Replace a non-empty ``item['content']`` with the extractor's text."""
        if len(item['content']):
            item['content'] = self.extractor.getText(
                jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        """Add a ``pin`` with coordinates to ``item`` when its city geocodes.

        Geocoding is best-effort: on any error the item is returned without
        a ``pin`` and the failure is logged when a logger is available.
        """
        if 'city' not in item:
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'),
                                 exactly_one=False)

            if places:
                # A list result contributes its first entry; anything else is
                # unpacked directly as a (place, (lat, lon)) pair.
                place, (lat, lon) = (
                    places[0] if isinstance(places, list) else places)

                if place:
                    item['pin'] = {
                        'location': {'lat': lat, 'lon': lon}
                    }
        except Exception:
            logger = getattr(self, 'logger', None)
            if logger is not None:
                logger.warning('Geocoding failed for city %r', item['city'],
                               exc_info=True)

        return item

    def _create_index(self, index, item):
        """Queue ``item`` for bulk indexing unless its id can already be read.

        The ``id`` key is removed from ``item`` and used as the document id.
        The document is queued only when the lookup in ``index`` raises
        ``ElasticSearchException``.
        """
        doc_id = item.pop('id')

        try:
            self.elastic.get(index, 'job', doc_id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', doc_id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        """Synchronise the City table with the cities seen in this run.

        Every city in ``cities`` is normalised, created if missing, and
        linked to ``spider_name``. Cities that were linked to the spider
        before but did not appear in this run are deleted when the spider
        was their only index, and otherwise just unlinked from it.
        """
        cities_current = City.objects.filter(
            indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices
            and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices
            and len(city.indices) > 1
        ]

        for raw_city in cities:
            name = unicode(raw_city.strip().lower())
            name = normalize_spaces.sub(' ', name)
            name = remove_braces.sub('', name)

            words = [remove_none_chars.sub('', word)
                     for word in name.split(' ')]
            name = ' '.join(filter(None, words))

            city, created = City.objects.get_or_create(name=name[:255])

            if created:
                city.indices = [spider_name]
            else:
                # An existing row may carry empty indices; start from an
                # empty list instead of failing on append.
                indices = list(city.indices or [])
                indices.append(spider_name)
                city.indices = list(set(indices))

            city.save()

            # The city is still in use, so it must survive the cleanup below.
            if city.name in cities_old_single:
                cities_old_single.remove(city.name)

            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked cities
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()