Example #1
from django.conf import settings
from django.http import Http404, HttpResponseRedirect
from django.views.generic.simple import direct_to_template
from pyes import ES
from pyes.exceptions import NotFoundException


def job_redirect(request, slug, source, job_id):
    if request.method == 'GET' and 'redirect' in request.GET:
        try:
            # Look up the job document and redirect to its external details URL.
            elastic = ES(settings.SEARCH_HOSTS)
            data = elastic.get(source, 'job', job_id)
            elastic.connection.close()
            return HttpResponseRedirect(data['_source']['details_url'])
        except NotFoundException:
            raise Http404

    return direct_to_template(request, 'pages/redirect.html')
Example #2
def job_redirect(request, slug, source, job_id):
    if request.method == "GET" and "redirect" in request.GET:
        try:
            elastic = ES(settings.SEARCH_HOSTS)
            data = elastic.get(source, "job", job_id)
            elastic.connection.close()
            return HttpResponseRedirect(data["_source"]["details_url"])
        except NotFoundException:
            raise Http404

    return direct_to_template(request, "pages/redirect.html")
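For context, a minimal sketch of the URLconf entry that would route to a view with this signature, using the Django 1.3-era syntax that matches direct_to_template; the URL regex, the 'jobs.views' module path and the pattern name are assumptions, not part of the original project:

# Hypothetical urls.py entry (regex and module path are placeholders).
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    url(r'^job/(?P<slug>[-\w]+)/(?P<source>[-\w]+)/(?P<job_id>[-\w]+)/$',
        'jobs.views.job_redirect', name='job_redirect'),
)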
Example #3
from django.conf import settings
from pyes import ES
from pyes.query import PrefixQuery, Search
from sorl.thumbnail.kvstores.base import KVStoreBase


class KVStore(KVStoreBase):
    """sorl-thumbnail key-value store backed by Elasticsearch (via pyes)."""

    def __init__(self, *args, **kwargs):
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        try:
            value = self.connection.get(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                        settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                        key)
            return value['_source']['value']
        except Exception:
            # Missing document (or connection error): behave like a cache miss.
            return None

    def _set_raw(self, key, value):
        ret = self.connection.index({"value": value},
                                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                    key)
        return ret['ok']

    def _delete_raw(self, *keys):
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                             settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                             key)
                rets.append(ret['ok'])
            except Exception:
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        # Prefix search against the document ids; returns at most 1000 matches.
        search = Search(query=PrefixQuery("_id", prefix), size=1000, start=0, fields=[])
        results = self.connection.search(search,
                                         indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX],
                                         doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE])
        return [hit['_id'] for hit in results['hits']['hits']]
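A sketch of the Django settings this store reads, plus the sorl-thumbnail hook that would activate it. The THUMBNAIL_ELASTIC_SEARCH_* names come from the class above; the host, index and dotted module path are illustrative assumptions, and THUMBNAIL_KVSTORE is sorl-thumbnail's setting for swapping in a custom key-value store backend:

# settings.py (illustrative values; adjust host, index and module path)
THUMBNAIL_ELASTIC_SEARCH_SERVERS = ['127.0.0.1:9200']
THUMBNAIL_ELASTIC_SEARCH_INDEX = 'thumbnails'
THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE = 'thumbnail'

# Point sorl-thumbnail at the custom store; the module path is hypothetical.
THUMBNAIL_KVSTORE = 'myproject.thumbnails.KVStore'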
Example #4
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if 'city' in item:
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # repoint the alias from the old indices to the new one
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # close the ES connection and drop Java references so they can be garbage-collected
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if 'city' not in item:
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {'location': {'lat': lat, 'lon': lon}}
        except Exception:
            # Geocoding is best-effort; ignore lookup failures.
            pass
        return item

    def _create_index(self, index, item):
        # only index the item if it is not already present in the new index
        item_id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', item_id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', item_id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # split previously saved cities by whether this spider is their only index
        cities_old_single = [city.name for city in cities_current
                             if city.indices and spider_name in city.indices and len(city.indices) == 1]
        cities_old_multi = [city.name for city in cities_current
                            if city.indices and spider_name in city.indices and len(city.indices) > 1]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name=city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove cities no longer linked to any index
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
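A minimal usage sketch for the task above, assuming it is registered as an old-style (pre-Celery 3) class-based task, which is what the Task subclass and the get_logger() call suggest; the spider name is a placeholder:

# Queue the reindexing task asynchronously for one spider.
ProcessSpiderData.delay('example_spider')

# Or run it in-process, e.g. from a management command or a shell.
ProcessSpiderData().run('example_spider')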