Example #1
0
 def post_table(self, params, files):
     """Handle a POST on this table: create a model, directly or via a factory.

     Args:
         params: Dict whose keys AND values are base64 encoded; must contain
             'path' (either 'model/...' or 'factory/...') and, for
             factories, 'table' plus slice parameters read by parse_slices().
         files: Must be empty; this table rejects file uploads.

     Returns:
         Result of _create_model_from_params or _create_model_from_factory.

     Raises:
         bottle.HTTPError: 400 if files are provided or 'path' names an
             unknown factory.
     """
     if files:
         bottle.abort(400, 'Table does not support files')
     # Both keys and values arrive base64 encoded for this endpoint.
     params = dict((base64.b64decode(k), base64.b64decode(v))
                   for k, v in params.items())
     path = params['path']
     with thrift_lock() as thrift:
         manager = PicarusManager(db=thrift)
         if path.startswith('model/'):
             # Direct model creation from the user-supplied parameters.
             return _create_model_from_params(manager, self.owner, path,
                                              params)
         elif path.startswith('factory/'):
             table = params['table']
             start_stop_rows = parse_slices()
             data_table = get_table(self._auth_user, table)
             # Caller must have read access to every requested slice of
             # the source data table.
             for start_row, stop_row in start_stop_rows:
                 data_table._slice_validate(start_row, stop_row, 'r')
             try:
                 factory = FACTORIES[path]
             except KeyError:
                 bottle.abort(400, 'Invalid parameter value [path]')
             # Slices are serialized as 'b64(start),b64(stop)' joined by ';'.
             slices = [
                 base64.b64encode(start_row) + ',' +
                 base64.b64encode(stop_row)
                 for start_row, stop_row in start_stop_rows
             ]
             # NOTE(review): the job record stores self.table, but the
             # factory below runs against data_table.table (resolved from
             # params['table']) — confirm the mismatch is intentional.
             job_row = JOBS.add_task('model', self.owner, {
                 'slices': ';'.join(slices),
                 'table': self.table,
                 'path': path
             }, {})
             return _create_model_from_factory(self.owner, thrift, path,
                                               factory, params,
                                               start_stop_rows,
                                               data_table.table, job_row)
Example #2
0
 def post_slice(self, start_row, stop_row, params, files):
     """Launch an asynchronous job over [start_row, stop_row) of this table.

     Args:
         start_row: Inclusive start of the slice.
         stop_row: Exclusive stop of the slice.
         params: Dict whose values are base64 encoded; 'action' selects the
             job, remaining keys are action-specific.
         files: Must be empty; this endpoint rejects file uploads.

     Returns:
         Dict with base64-encoded 'row' and 'table' keys pointing at the
         created job in the 'jobs' table.

     Raises:
         bottle.HTTPError: 400 for file uploads or unknown actions.
     """
     if files:
         bottle.abort(400, 'Table does not support files')
     # Values arrive base64 encoded; keys are plain.
     params = dict((k, base64.b64decode(v)) for k, v in params.items())
     action = params['action']

     def _job_result(job_row):
         # Every job-launching action responds with the same
         # base64-encoded pointer into the 'jobs' table.
         return dict((base64.b64encode(k), base64.b64encode(v))
                     for k, v in {'row': job_row, 'table': 'jobs'}.items())

     def _add_task(task_type, extra=None):
         # Register the task with the job tracker; every payload carries
         # the slice bounds, table name, and action.
         payload = {'startRow': base64.b64encode(start_row),
                    'stopRow': base64.b64encode(stop_row),
                    'table': self.table,
                    'action': action}
         if extra:
             payload.update(extra)
         return JOBS.add_task(task_type, self.owner, payload, {})

     with thrift_lock() as thrift:
         manager = PicarusManager(db=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             # Makes 150x150 thumbnails from the data:image column
             model = [{'name': 'picarus.ImagePreprocessor',
                       'kw': {'method': 'force_square',
                              'size': 150,
                              'compression': 'jpg'}}]
             job_row = _add_task('process')
             thrift.takeout_chain_job('images',
                                      model,
                                      'data:image',
                                      'thum:image_150sq',
                                      start_row=start_row,
                                      stop_row=stop_row,
                                      job_row=job_row)
             return _job_result(job_row)
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             job_row = _add_task('process')
             thrift.exif_job(start_row=start_row,
                             stop_row=stop_row,
                             job_row=job_row)
             return _job_result(job_row)
         elif action == 'io/copy':
             self._slice_validate(start_row, stop_row, 'rw')
             input_column = params['inputColumn']
             output_column = params['outputColumn']
             self._column_write_validate(output_column)
             job_row = _add_task('process', {
                 'inputColumn': base64.b64encode(input_column),
                 'outputColumn': base64.b64encode(output_column)})
             thrift.copy_job('images',
                             input_column=input_column,
                             output_column=output_column,
                             start_row=start_row,
                             stop_row=stop_row,
                             job_row=job_row)
             return _job_result(job_row)
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_input, model_link = _takeout_input_model_link_from_key(
                 manager, model_key)
             job_row = _add_task('process')
             thrift.takeout_chain_job('images', [model_link],
                                      chain_input,
                                      model_key,
                                      start_row=start_row,
                                      stop_row=stop_row,
                                      job_row=job_row)
             return _job_result(job_row)
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_inputs, model_chain = zip(
                 *_takeout_input_model_chain_from_key(manager, model_key))
             job_row = _add_task('process')
             thrift.takeout_chain_job('images',
                                      list(model_chain),
                                      chain_inputs[0],
                                      model_key,
                                      start_row=start_row,
                                      stop_row=stop_row,
                                      job_row=job_row)
             return _job_result(job_row)
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             job_row = _add_task('crawl')
             thrift.flickr_job(params, start_row, stop_row, job_row)
             return _job_result(job_row)
         elif action == 'o/crawl/streetview':
             self._slice_validate(start_row, stop_row, 'w')
             job_row = _add_task('crawl')
             thrift.street_view_job(params, start_row, stop_row, job_row)
             return _job_result(job_row)
         else:
             bottle.abort(400, 'Invalid parameter value [action]')
Example #3
0
 def post_slice(self, start_row, stop_row, params, files):
     """Dispatch an action over the row slice [start_row, stop_row).

     Bug fix: in the 'o/crawl/flickr' branch the geo parameter assignments
     previously chained through 'query' (p['lat'] = query = ...), silently
     replacing the search query with params.get('radius').

     Args:
         start_row: Inclusive start of the slice.
         stop_row: Exclusive stop of the slice.
         params: Request parameters; 'action' selects the operation, and
             model/column values are urlsafe-base64 encoded.
         files: Unused.  NOTE(review): unlike sibling handlers, this
             version does not reject file uploads — confirm intended.

     Returns:
         Action-specific dict, a JSON string for dedupe, a row count for
         the crawl, or a task descriptor for annotation actions.

     Raises:
         bottle.HTTPError: 400 for unknown actions, 503 when the
             annotator pool is at capacity.
     """
     action = params['action']
     with thrift_lock() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/preprocess':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/classify':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/feature':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/hash':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = base64.urlsafe_b64decode(params['column'])
             features = {}

             def dedupe_feature(row_key, value):
                 # Bucket row keys by the MD5 of the column value; buckets
                 # with more than one member are byte-identical duplicates.
                 features.setdefault(base64.b64encode(hashlib.md5(value).digest()), []).append(base64.urlsafe_b64encode(row_key))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             class_name = params['className']
             query = params.get('query')
             query = class_name if query is None else query
             # FIX: do not chain these assignments through 'query' — the
             # old form clobbered the search query with the geo values.
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             # Optional integer parameters: absent keys simply stay unset.
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
         elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
             self._slice_validate(start_row, stop_row, 'r')
             # Fresh unguessable identifiers for the annotation task.
             secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             p = {}
             image_column = base64.urlsafe_b64decode(params['imageColumn'])
             if action == 'io/annotate/image/entity':
                 entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                 data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row),
                                                                                    entity_column, image_column)
                 p['type'] = 'image_entity'
             elif action == 'io/annotate/image/query':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query'
                 p['query'] = query
             elif action == 'io/annotate/image/query_batch':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query_batch'
                 p['query'] = query
             else:
                 bottle.abort(400)
             p['num_tasks'] = 100
             p['mode'] = 'standalone'
             try:
                 redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
             except annotators.CapacityException:
                 bottle.abort(503)
             p['setup'] = True
             p['reset'] = True
             p['secret'] = secret
             p['redis_address'] = redis_host
             p['redis_port'] = int(redis_port)
             mturk_vision.manager(data=data, **p)
             return {'task': task}
         else:
             bottle.abort(400)
Example #4
0
 def post_row(self, row, params, files):
     """Apply a takeout model chain (or the builtin thumbnailer) to one row.

     For 'io/*' actions the model output is also written back to the row;
     'i/*' actions only read.  Returns a JSON object mapping the
     base64-encoded output column to the base64-encoded model output.
     """
     if files:
         bottle.abort(400, 'Table does not support files')
     # Parameter values arrive base64 encoded; keys are plain.
     params = {k: base64.b64decode(v) for k, v in params.items()}
     action = params['action']
     with thrift_lock() as thrift:
         manager = PicarusManager(db=thrift)
         if action in ('i/link', 'i/chain', 'io/link', 'io/chain'):
             model_key = params['model']
             write_result = action.startswith('io/')
             # Writing the result back requires read/write access.
             self._row_validate(row, 'rw' if write_result else 'r')
             if action.endswith('/link'):
                 # Single-model takeout: one input column, one model.
                 chain_input, model_link = _takeout_input_model_link_from_key(
                     manager, model_key)
                 binary_input = thrift.get_column(self.table, row,
                                                  chain_input)
                 chain = [model_link]
             else:
                 # Full-chain takeout: the first input column feeds the
                 # entire model chain.
                 chain_inputs, model_chain = zip(
                     *_takeout_input_model_chain_from_key(manager,
                                                          model_key))
                 binary_input = thrift.get_column(self.table, row,
                                                  chain_inputs[0])
                 chain = list(model_chain)
             model = picarus_takeout.ModelChain(msgpack.dumps(chain))
             bottle.response.headers["Content-type"] = "application/json"
             model_out = model.process_binary(binary_input)
             if write_result:
                 thrift.mutate_row(self.table, row, {model_key: model_out})
             return json.dumps(
                 {base64.b64encode(model_key): base64.b64encode(model_out)})
         elif action in ('io/thumbnail', 'i/thumbnail'):
             # TODO: Refactor this, it shares code with link/chain
             write_result = action.startswith('io/')
             self._row_validate(row, 'rw' if write_result else 'r')
             # Makes 150x150 thumbnails from the data:image column
             thumb_chain = [{'name': 'picarus.ImagePreprocessor',
                             'kw': {'method': 'force_square',
                                    'size': 150,
                                    'compression': 'jpg'}}]
             model = picarus_takeout.ModelChain(msgpack.dumps(thumb_chain))
             bottle.response.headers["Content-type"] = "application/json"
             model_out = model.process_binary(
                 thrift.get_column(self.table, row, 'data:image'))
             if write_result:
                 thrift.mutate_row(self.table, row,
                                   {'thum:image_150sq': model_out})
             return json.dumps({base64.b64encode('thum:image_150sq'):
                                base64.b64encode(model_out)})
         else:
             bottle.abort(400, 'Invalid parameter value [action]')
Example #5
0
File: tables.py  Project: sanchan/picarus
 def post_slice(self, start_row, stop_row, params, files):
     """Dispatch an action over the row slice [start_row, stop_row).

     Args:
         start_row: Inclusive start of the slice.
         stop_row: Exclusive stop of the slice.
         params: Dict whose values are base64 encoded; 'action' selects
             the operation, remaining keys are action-specific.
         files: Must be empty; this endpoint rejects file uploads.

     Returns:
         Action-specific dict, a JSON string (dedupe), or a row count
         (crawl).  NOTE(review): the 'i/faces' branch has no return
         statement and therefore yields None.

     Raises:
         bottle.HTTPError: 400 for file uploads, unknown actions, or a
             missing Flickr API key/secret.
     """
     if files:
         bottle.abort(400)
     # Values arrive base64 encoded; keys are plain.
     params = {k: base64.b64decode(v) for k, v in params.items()}
     action = params['action']
     with thrift_new() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             # Single-model takeout: resolve the input column and model.
             chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             # Full-chain takeout: the first input column feeds the chain.
             chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/faces':
             # TODO: Temporary, remove when done
             # Experimental: train an LBPH face recognizer on the first
             # num_train rows belonging to these identities, then score
             # the remaining rows, printing a running accuracy.
             names = set(['George_W_Bush', 'Colin_Powell', 'Tony_Blair', 'Donald_Rumsfeld', 'Gerhard_Schroeder',
                          'Ariel_Sharon', 'Hugo_Chavez', 'Junichiro_Koizumi', 'Serena_Williams', 'John_Ashcroft'])
             self._slice_validate(start_row, stop_row, 'r')
             import cv2
             r = None
             labels = {}
             pos = 0
             neg = 0
             data = []
             lab = []
             num_train = 2000
             for n, (cur_row, cur_cols) in enumerate(hadoopy_hbase.scanner(thrift, self.table,
                                                                           start_row=start_row, per_call=10,
                                                                           stop_row=stop_row, columns=['data:image', 'meta:class'])):
                 cur_class = cur_cols['meta:class']
                 if cur_class not in names:
                     continue
                 # Assign a dense integer label per class, in first-seen order.
                 if cur_class not in labels:
                     labels[cur_class] = len(labels)
                 label = labels[cur_class]
                 # Decode grayscale (flag 0) from the stored image bytes.
                 image = cv2.imdecode(np.fromstring(cur_cols['data:image'], np.uint8), 0)
                 # Crop
                 image = np.ascontiguousarray(image[62:-62, 62:-62])
                 #if n == 0:
                 #    cv2.imwrite('out.png', image)
                 if n < num_train:
                     lab.append(label)
                     data.append(image)
                 else:
                     # Train lazily on first evaluation row, then predict.
                     if r is None:
                         r = cv2.createLBPHFaceRecognizer()
                         r.train(data, np.array(lab))
                         print('TRAINED-----------------------')
                     pred = r.predict(image)[0]
                     print((pred, label))
                     if pred == label:
                         pos += 1
                     else:
                         neg += 1
                 print((cur_class, image.shape, n, pos, neg, pos / float(pos + neg + .00000001)))
         elif action == 'io/garbage':
             self._slice_validate(start_row, stop_row, 'rw')
             # Dry-run report: classifies columns as keep/remove but does
             # not delete anything here.
             columns_removed = set()
             columns_kept = set()
             # TODO: Get all user models and save those too
             active_models = set()
             # KeyOnlyFilter: we only need column names, not cell values.
             for cur_row, cur_cols in hadoopy_hbase.scanner(thrift, self.table, filter='KeyOnlyFilter()',
                                                            start_row=start_row, per_call=10,
                                                            stop_row=stop_row):
                 for k in cur_cols.keys():
                     # Keep metadata, thumbnails, the source image, and
                     # any column backing an active model.
                     if not (k.startswith('meta:') or k.startswith('thum:') or k == 'data:image' or k in active_models):
                         if k not in columns_removed:
                             columns_removed.add(k)
                             print(columns_removed)
                             print(len(columns_removed))
                     else:
                         if k not in columns_kept:
                             columns_kept.add(k)
                             print(columns_kept)
                             print(len(columns_kept))
             return {'columnsRemoved': list(columns_removed), 'columnsKept': list(columns_kept)}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = params['column']
             features = {}
             # Bucket row keys by the MD5 of the column value; buckets with
             # more than one member are byte-identical duplicates.
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             print('params[%r]' % params)
             class_name = params.get('className')
             query = params['query']
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if not p['api_key'] or not p['api_secret']:
                 bottle.abort(400)  # Either we don't have a default or the user provided an empty key
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             if 'onePerOwner' in params:
                 p['one_per_owner'] = params['onePerOwner'] == '1'
             # Optional integer parameters: absent keys simply stay unset.
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_rows'] = int(params['maxRows'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name=class_name, query=query, **p)}
         else:
             bottle.abort(400)
         else:
             bottle.abort(400)