Example #1
0
File: tables.py  Project: sanchan/picarus
 def post_table(self, params, files):
     """Create a crowdsourced image-class annotation task over table slices.

     `params` arrives with base64-encoded keys and values (Bottle form
     fields) describing the table path, row slices, image/class columns,
     task count and mode.  Registers the task with the global ANNOTATORS
     registry, starts an mturk_vision manager for it, and returns
     {'task': <b64 task id>}.  Aborts with HTTP 400 on unsupported input.
     NOTE(review): relies on Python 2 str/bytes semantics (b64decode
     results are used directly as text keys) — confirm before porting.
     """
     # This endpoint takes no file uploads.
     if files:
         bottle.abort(400)
     # Every form key and value is base64 encoded; decode both sides.
     params = {base64.b64decode(k): base64.b64decode(v) for k, v in params.items()}
     path = params['path']
     # presumably parses the requested row ranges out of the request —
     # defined elsewhere in this module; TODO confirm it needs no args
     start_stop_rows = parse_slices()
     if path in ('images/class',):
         data_table = get_table(self._auth_user, path.split('/', 1)[0])
         # The caller must have read access to every requested slice.
         for start_row, stop_row in start_stop_rows:
             data_table._slice_validate(start_row, stop_row, 'r')
         # We never need to decode these, they just need to be
         # random strings that can be in a url
         secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
         task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
         p = {}
         image_column = params['imageColumn']
         ub64 = base64.urlsafe_b64encode
         if path == 'images/class':
             class_column = params['classColumn']
             assert class_column.startswith('meta:')
             # Encode each (start, stop) row pair into the data URL path.
             suffix = '/'.join(ub64(x) + '/' + ub64(y) for x, y in start_stop_rows)
             data = 'hbase://localhost:9090/images/%s?class=%s&image=%s' % (suffix,
                                                                            ub64(class_column), ub64(image_column))
             p['type'] = 'image_class'
             # Optional display parameters; silently skipped when absent.
             try:
                 p['class_descriptions'] = params['classDescriptions']
             except KeyError:
                 pass
             try:
                 p['class_thumbnails'] = params['classThumbnails']
             except KeyError:
                 pass
         else:
             bottle.abort(400)
         if 'instructions' in params:
             p['instructions'] = params['instructions']
         p['num_tasks'] = int(params['numTasks'])
         assert 0 < p['num_tasks']
         assert params['mode'] in ('standalone', 'amt')
         p['mode'] = params['mode']
         p['task_key'] = task
         # Register the task; presumably returns the (host, port) of the
         # redis instance serving this task — TODO confirm return shape.
         redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p)
         p['sync'] = True
         p['secret'] = secret
         p['redis_address'] = redis_host
         p['redis_port'] = int(redis_port)
         # Called for its side effect: instantiating the manager sets up
         # the task state in redis.
         mturk_vision.manager(data=data, **p)
         return {'task': base64.b64encode(task)}
     else:
         bottle.abort(400)
Example #2
0
 def get_manager(self, task):
     """Return the annotation manager for *task*, building and caching it
     on first use.  Raises via self.exists() when the task is unknown."""
     self.exists(task)
     if task in self.cache:
         return self.cache[task]
     # Cache miss: rebuild the manager from the stored task record.
     record = self.db.hgetall(task + ':annot')
     config = json.loads(record['params'])
     config['sync'] = False
     config['secret'] = record['_secret']
     config['redis_address'] = self.redis_host
     config['redis_port'] = int(self.redis_port)
     built = mturk_vision.manager(data=record['_data'], **config)
     self.cache[task] = built
     return built
Example #3
0
File: jobs.py  Project: William-Wai/picarus
 def get_annotation_manager(self, task, data_connection, sync=False):
     """Build a fresh mturk_vision manager for an annotation *task*.

     Validates that the task exists and is of type 'annotation', then
     assembles the manager kwargs from the stored 'params' and '_params'
     JSON blobs in the task record.
     """
     self._exists(task)
     self._check_type(task, 'annotation')
     record = self.db.hgetall(self._task_prefix + task)
     kwargs = json.loads(record['params'])
     hidden = json.loads(record['_params'])
     kwargs['sync'] = sync
     kwargs['secret'] = str(hidden['secret'])
     kwargs['redis_address'] = self.annotation_redis_host
     kwargs['redis_port'] = int(self.annotation_redis_port)
     kwargs['task_key'] = task
     # TODO: Currently only compatible with thrift based datastores
     conn = data_connection._thrift if data_connection else data_connection
     return mturk_vision.manager(data=str(hidden['data']), data_connection=conn, **kwargs)
Example #4
0
 def get_annotation_manager(self, task, data_connection, sync=False):
     """Construct an mturk_vision manager for the given annotation task,
     merging stored task parameters with the live redis connection info."""
     self._exists(task)
     self._check_type(task, 'annotation')
     row = self.db.hgetall(self._task_prefix + task)
     manager_args = json.loads(row['params'])
     stored = json.loads(row['_params'])
     manager_args.update(sync=sync,
                         secret=str(stored['secret']),
                         redis_address=self.annotation_redis_host,
                         redis_port=int(self.annotation_redis_port),
                         task_key=task)
     # TODO: Currently only compatible with thrift based datastores
     if data_connection:
         data_connection = data_connection._thrift
     return mturk_vision.manager(data=str(stored['data']),
                                 data_connection=data_connection,
                                 **manager_args)
Example #5
0
 def get_manager(self, task):
     """Return a (cached) manager for *task*, fetching the task record
     first so a deleted task fails before the cache is consulted."""
     # TODO: Need to ensure no races due to a worker running while the DB is destroyed
     #       can fix by putting a unique key in state_db, that is verified each call
     #       but needs to lock the race down (check and set?)
     record = self.db.hgetall(self.annotator_prefix + task)
     opts = json.loads(record['params'])
     opts['setup'] = False
     opts['reset'] = False
     opts['secret'] = record['_secret']
     host, port = record['_redis_host_port'].split(':')
     opts['redis_address'] = host
     opts['redis_port'] = int(port)
     # This ensures that the task still exists before we reuse the cache
     if task not in self.cache:
         self.cache[task] = mturk_vision.manager(data=record['_data'], **opts)
     return self.cache[task]
Example #6
0
def server(**args):
    """Create the global mturk_vision MANAGER and serve the bottle app.

    Binds a gevent WSGI server on 0.0.0.0 at args['port'] and blocks
    forever.  Stores both objects in module globals for other handlers.
    """
    global MANAGER, SERVER
    MANAGER = mturk_vision.manager(**args)
    listen_port = int(args['port'])
    SERVER = gevent.pywsgi.WSGIServer(('0.0.0.0', listen_port), bottle.app())
    SERVER.serve_forever()
Example #7
0
 def post_slice(self, start_row, stop_row, params, files):
     """Run one action over the row slice [start_row, stop_row).

     Dispatches on params['action']: batch image jobs (thumbnail, exif,
     preprocess, classify, feature, link, chain, hash), identical-image
     dedupe, a Flickr crawl into the slice, or creation of a crowd
     annotation task.  Aborts with HTTP 400 on unknown actions and 503
     when the annotator pool is at capacity; otherwise returns a
     JSON-serializable result specific to the action.
     """
     action = params['action']
     with thrift_lock() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/preprocess':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/classify':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/feature':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/hash':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = base64.urlsafe_b64decode(params['column'])
             # Group rows by the MD5 of the column value; any group with
             # more than one row is a set of identical duplicates.
             features = {}
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.urlsafe_b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             class_name = params['className']
             query = params.get('query')
             query = class_name if query is None else query
             # BUG FIX: these three lines previously read
             # `p['lat'] = query = params.get('lat')` (and likewise for
             # 'lon' / 'radius'), which clobbered the search query computed
             # above with the radius value before it was passed to
             # flickr_crawl below.
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             # Optional numeric date/paging filters; absent keys are skipped.
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
         elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
             self._slice_validate(start_row, stop_row, 'r')
             # These are never decoded; they only need to be url-safe
             # random identifiers.
             secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             p = {}
             image_column = base64.urlsafe_b64decode(params['imageColumn'])
             if action == 'io/annotate/image/entity':
                 entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                 data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row),
                                                                                    entity_column, image_column)
                 p['type'] = 'image_entity'
             elif action == 'io/annotate/image/query':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query'
                 p['query'] = query
             elif action == 'io/annotate/image/query_batch':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query_batch'
                 p['query'] = query
             else:
                 bottle.abort(400)
             p['num_tasks'] = 100
             p['mode'] = 'standalone'
             try:
                 redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
             except annotators.CapacityException:
                 bottle.abort(503)
             p['setup'] = True
             p['reset'] = True
             p['secret'] = secret
             p['redis_address'] = redis_host
             p['redis_port'] = int(redis_port)
             # Called for its side effect: constructing the manager
             # initializes the annotation task state.
             mturk_vision.manager(data=data, **p)
             return {'task': task}
         else:
             bottle.abort(400)