示例#1
0
class ProcessSpiderData(Task):
    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        java = JavaInterface()

        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source

        if spider and len(source):
            backup_created_date = spider.created_date
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job':{'properties':mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )

            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))

            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        if not item.has_key('city'):
            return item

        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)

            if places:
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place: item['pin'] = {
                    'location': { 'lat': lat, 'lon': lon }
                 }
        except: pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']

        try:
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) == 1 ]
        cities_old_multi = [ city.name for city in cities_current if city.indices and spider_name in city.indices and len(city.indices) > 1 ]

        for city in cities:
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)

            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name = city_clean[:255])

            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))

            city.save()

            if city.name in cities_old_single: cities_old_single.remove(city.name)
            if city.name in cities_old_multi: cities_old_multi.remove(city.name)

        # remove unlinked citie
        City.objects.filter(name__in=cities_old_single).delete()

        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
示例#2
0
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
        	           timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
#        index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None, the existing doc will be
           updated.
        '''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True

#        empty_mapping = not cur_mapping[index_name].get(index_type, {})
#        if empty_mapping:
#            #if no existing mapping available for index_type
#            #force update_mapping to True
#            update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type,
                                   _mapping,
                                   [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res