Example #1
 def post_slice(self, start_row, stop_row, params, files):
     action = params['action']
     with thrift_lock() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/preprocess':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/classify':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/feature':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/hash':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = base64.urlsafe_b64decode(params['column'])
             features = {}
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.urlsafe_b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             class_name = params['className']
             query = params.get('query')
             query = class_name if query is None else query
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
         elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
             self._slice_validate(start_row, stop_row, 'r')
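             # secret/task are 22-char tokens: urlsafe base64 of 16 UUID bytes is 24 chars; [:-2] strips the '==' padding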
             secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             p = {}
             image_column = base64.urlsafe_b64decode(params['imageColumn'])
             if action == 'io/annotate/image/entity':
                 entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                 data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row),
                                                                                    entity_column, image_column)
                 p['type'] = 'image_entity'
             elif action == 'io/annotate/image/query':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query'
                 p['query'] = query
             elif action == 'io/annotate/image/query_batch':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query_batch'
                 p['query'] = query
             else:
                 bottle.abort(400)
             p['num_tasks'] = 100
             p['mode'] = 'standalone'
             try:
                 redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
             except annotators.CapacityException:
                 bottle.abort(503)
             p['setup'] = True
             p['reset'] = True
             p['secret'] = secret
             p['redis_address'] = redis_host
             p['redis_port'] = int(redis_port)
             mturk_vision.manager(data=data, **p)
             return {'task': task}
         else:
             bottle.abort(400)
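The o/crawl/flickr branch above only accepts slices whose stop_row is the start_row with its last byte incremented, i.e. slices that cover exactly the rows sharing the start_row prefix. A minimal sketch of that relationship (Python 2 byte strings, as in the code above; prefix_to_stop_row is an illustrative helper, not part of picarus):

def prefix_to_stop_row(row_prefix):
    # Exclusive upper bound for a prefix scan: bump the final byte by one.
    assert row_prefix and ord(row_prefix[-1]) != 255
    return row_prefix[:-1] + chr(ord(row_prefix[-1]) + 1)

# Every row beginning with 'user0:' sorts inside ['user0:', 'user0;'), so
# prefix_to_stop_row('user0:') == 'user0;' satisfies the assertion above.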
Example #2
    def flickr_job(self, params, start_row, stop_row, job_row):
        # Only slices where the start_row can be used as a prefix may be used
        assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
        p = {}
        row_prefix = start_row
        p['class_name'] = params.get('className')
        p['query'] = params.get('query')
        p['lat'] = params.get('lat')
        p['lon'] = params.get('lon')
        if not ((p['lat'] and p['lon']) or p['query']):
            bottle.abort(400, 'Invalid crawler parameters')
        p['radius'] = params.get('radius')
        p['size'] = params.get('size', 'm')
        if p['size'] not in ('sq', 't', 's', 'q', 'm', 'n', 'z', 'c', 'l', 'o'):
            bottle.abort(400, 'Invalid size')
        p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
        p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
        if not p['api_key'] or not p['api_secret']:
            bottle.abort(400)  # Either we don't have a default or the user provided an empty key
        if 'hasGeo' in params:
            p['has_geo'] = params['hasGeo'] == '1'
        if 'onePerOwner' in params:
            p['one_per_owner'] = params['onePerOwner'] == '1'
        try:
            p['max_rows'] = int(params['maxRows'])
        except KeyError:
            pass
        iterations = min(10000, int(params.get('iterations', 1)))
        try:
            min_upload_date = int(params['minUploadDate'])
        except KeyError:
            min_upload_date = None
        try:
            max_upload_date = int(params['maxUploadDate'])
        except KeyError:
            max_upload_date = None
        try:
            upload_date_radius = int(params['uploadDateRadius'])
            if max_upload_date is None:
                max_upload_date = int(time.time())
            if min_upload_date is None:
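                # Fallback lower bound; 1232170610 is 2009-01-17 UTC (presumably the earliest upload date worth crawling)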
                min_upload_date = 1232170610
        except KeyError:
            upload_date_radius = None
            if max_upload_date is not None:
                p['max_upload_date'] = max_upload_date
            if min_upload_date is not None:
                p['min_upload_date'] = min_upload_date
        if iterations > 1 and upload_date_radius is None:
            bottle.abort(400)  # Need to specify a radius for multiple iters or else we will return same vals
        try:
            p['page'] = int(params['page'])
        except KeyError:
            pass
        job_columns = {'goodRows': 0, 'badRows': 0, 'status': 'running'}

        def store(crawl_kwargs, image, source, **kw):
            print('In store')
            # TODO: Need to extend onePerOwner to multiple iterations
            query = crawl_kwargs.get('query')
            class_name = crawl_kwargs.get('class_name')
            cols = {}
            md5 = lambda x: hashlib.md5(x).digest()
            cur_md5 = md5(image)
            cols['data:image'] = image
            if class_name is not None:
                cols['meta:class'] = class_name
            if query is not None:
                cols['meta:query'] = query
            cols['meta:source'] = source
            cols['hash:md5'] = cur_md5
            for x, y in kw.items():
                cols['meta:' + x] = y
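            # Row key = crawl prefix + raw MD5 digest, so an identical image re-crawled under the same prefix overwrites its own row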
            row = row_prefix + cur_md5
            print('Pre mutate row')
            self.mutate_row('images', row, cols)
            print('Post mutate row')
            job_columns['goodRows'] += 1
            self._jobs.update_task(job_row, job_columns)

        for n in range(iterations):
            print('Iter[%d]' % n)
            if upload_date_radius:
                p['min_upload_date'] = random.randint(
                    min_upload_date, max_upload_date - upload_date_radius)
                p['max_upload_date'] = p['min_upload_date'] + upload_date_radius
            crawlers.flickr_crawl(store, **p)
        job_columns['status'] = 'completed'
        self._jobs.update_task(job_row, job_columns)
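The iteration loop above slides a random upload-date window across each pass so repeated crawls do not return the same photos (hence the abort when iterations > 1 without uploadDateRadius). A standalone sketch of the windowing logic under that reading (random_upload_windows is an illustrative name, not part of picarus):

import random
import time

def random_upload_windows(iterations, radius, min_date=1232170610, max_date=None):
    # Yield one (min_upload_date, max_upload_date) pair per crawl iteration.
    # Each window is `radius` seconds wide and starts at a random point in
    # [min_date, max_date - radius], mirroring the loop in flickr_job above.
    if max_date is None:
        max_date = int(time.time())
    for _ in range(iterations):
        lo = random.randint(min_date, max_date - radius)
        yield lo, lo + radius

# e.g. three one-day windows:
# for lo, hi in random_upload_windows(3, 86400):
#     print(lo, hi)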
Example #3
File: tables.py  Project: sanchan/picarus
 def post_slice(self, start_row, stop_row, params, files):
     if files:
         bottle.abort(400)
     params = {k: base64.b64decode(v) for k, v in params.items()}
     action = params['action']
     with thrift_new() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/faces':
             # TODO: Temporary, remove when done
             names = set(['George_W_Bush', 'Colin_Powell', 'Tony_Blair', 'Donald_Rumsfeld', 'Gerhard_Schroeder',
                          'Ariel_Sharon', 'Hugo_Chavez', 'Junichiro_Koizumi', 'Serena_Williams', 'John_Ashcroft'])
             self._slice_validate(start_row, stop_row, 'r')
             import cv2
             r = None
             labels = {}
             pos = 0
             neg = 0
             data = []
             lab = []
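             # The first num_train scanned rows train the recognizer; the rest are held out for evaluation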
             num_train = 2000
             for n, (cur_row, cur_cols) in enumerate(hadoopy_hbase.scanner(thrift, self.table,
                                                                           start_row=start_row, per_call=10,
                                                                           stop_row=stop_row, columns=['data:image', 'meta:class'])):
                 cur_class = cur_cols['meta:class']
                 if cur_class not in names:
                     continue
                 if cur_class not in labels:
                     labels[cur_class] = len(labels)
                 label = labels[cur_class]
                 image = cv2.imdecode(np.fromstring(cur_cols['data:image'], np.uint8), 0)
                 # Crop
                 image = np.ascontiguousarray(image[62:-62, 62:-62])
                 #if n == 0:
                 #    cv2.imwrite('out.png', image)
                 if n < num_train:
                     lab.append(label)
                     data.append(image)
                 else:
                     if r is None:
                         r = cv2.createLBPHFaceRecognizer()
                         r.train(data, np.array(lab))
                         print('TRAINED-----------------------')
                     pred = r.predict(image)[0]
                     print((pred, label))
                     if pred == label:
                         pos += 1
                     else:
                         neg += 1
                 print((cur_class, image.shape, n, pos, neg, pos / float(pos + neg + .00000001)))
         elif action == 'io/garbage':
             self._slice_validate(start_row, stop_row, 'rw')
             columns_removed = set()
             columns_kept = set()
             # TODO: Get all user models and save those too
             active_models = set()
             for cur_row, cur_cols in hadoopy_hbase.scanner(thrift, self.table, filter='KeyOnlyFilter()',
                                                            start_row=start_row, per_call=10,
                                                            stop_row=stop_row):
                 for k in cur_cols.keys():
                     if not (k.startswith('meta:') or k.startswith('thum:') or k == 'data:image' or k in active_models):
                         if k not in columns_removed:
                             columns_removed.add(k)
                             print(columns_removed)
                             print(len(columns_removed))
                     else:
                         if k not in columns_kept:
                             columns_kept.add(k)
                             print(columns_kept)
                             print(len(columns_kept))
             return {'columnsRemoved': list(columns_removed), 'columnsKept': list(columns_kept)}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = params['column']
             features = {}
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             print('params[%r]' % params)
             class_name = params.get('className')
             query = params['query']
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if not p['api_key'] or not p['api_secret']:
                 bottle.abort(400)  # Either we don't have a default or the user provided an empty key
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             if 'onePerOwner' in params:
                 p['one_per_owner'] = params['onePerOwner'] == '1'
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_rows'] = int(params['maxRows'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name=class_name, query=query, **p)}
         else:
             bottle.abort(400)
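Both i/dedupe/identical branches (Examples #1 and #3) reduce to the same pattern: bucket rows by the MD5 of a single column's value and report every bucket holding more than one row. A self-contained sketch of that grouping with the HBase scanner replaced by a plain iterable of (row, value) pairs (dedupe_identical is an illustrative name, not part of picarus):

import base64
import hashlib

def dedupe_identical(row_values):
    # row_values: iterable of (row_key, column_value) byte-string pairs.
    features = {}
    for row, value in row_values:
        digest = base64.b64encode(hashlib.md5(value).digest())
        features.setdefault(digest, []).append(base64.b64encode(row))
    # Buckets with two or more rows share an identical column value.
    return [{'rows': rows} for rows in features.values() if len(rows) > 1]

# dedupe_identical([('a', 'x'), ('b', 'x'), ('c', 'y')])
# -> [{'rows': ['YQ==', 'Yg==']}]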