# NOTE: The handlers below are methods of Picarus REST resource classes; they
# assume module-level imports of base64, json, hashlib, uuid, bottle, msgpack,
# hadoopy_hbase, picarus_takeout, crawlers, annotators, and mturk_vision.
def post_table(self, params, files):
    if files:
        bottle.abort(400, 'Table does not support files')
    # Both keys and values arrive base64 encoded
    params = dict((base64.b64decode(k), base64.b64decode(v)) for k, v in params.items())
    path = params['path']
    with thrift_lock() as thrift:
        manager = PicarusManager(db=thrift)
        if path.startswith('model/'):
            return _create_model_from_params(manager, self.owner, path, params)
        elif path.startswith('factory/'):
            table = params['table']
            start_stop_rows = parse_slices()
            data_table = get_table(self._auth_user, table)
            for start_row, stop_row in start_stop_rows:
                data_table._slice_validate(start_row, stop_row, 'r')
            try:
                factory = FACTORIES[path]
            except KeyError:
                bottle.abort(400, 'Invalid parameter value [path]')
            slices = [base64.b64encode(start_row) + ',' + base64.b64encode(stop_row)
                      for start_row, stop_row in start_stop_rows]
            job_row = JOBS.add_task('model', self.owner, {'slices': ';'.join(slices),
                                                          'table': self.table,
                                                          'path': path}, {})
            return _create_model_from_factory(self.owner, thrift, path, factory, params,
                                              start_stop_rows, data_table.table, job_row)
        else:
            bottle.abort(400, 'Invalid parameter value [path]')
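
# Hedged client-side sketch for post_table above: the handler decodes both
# keys and values with standard base64 and routes on a 'path' of the form
# 'model/...' or 'factory/...'. The helper name is illustrative, not part of
# this module.
import base64


def _example_encode_table_params(path, extra_params):
    params = {'path': path}
    params.update(extra_params)
    return dict((base64.b64encode(k), base64.b64encode(v))
                for k, v in params.items())

# e.g. _example_encode_table_params('factory/...', {'table': 'images'})
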
def post_slice(self, start_row, stop_row, params, files):
    if files:
        bottle.abort(400, 'Table does not support files')
    params = dict((k, base64.b64decode(v)) for k, v in params.items())
    action = params['action']
    with thrift_lock() as thrift:
        manager = PicarusManager(db=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            # Makes 150x150 thumbnails from the data:image column
            model = [{'name': 'picarus.ImagePreprocessor',
                      'kw': {'method': 'force_square', 'size': 150, 'compression': 'jpg'}}]
            job_row = JOBS.add_task('process', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.takeout_chain_job('images', model, 'data:image', 'thum:image_150sq',
                                     start_row=start_row, stop_row=stop_row, job_row=job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            job_row = JOBS.add_task('process', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.exif_job(start_row=start_row, stop_row=stop_row, job_row=job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'io/copy':
            self._slice_validate(start_row, stop_row, 'rw')
            input_column = params['inputColumn']
            output_column = params['outputColumn']
            self._column_write_validate(output_column)
            job_row = JOBS.add_task('process', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'inputColumn': base64.b64encode(input_column),
                                     'outputColumn': base64.b64encode(output_column),
                                     'table': self.table, 'action': action}, {})
            thrift.copy_job('images', input_column=input_column, output_column=output_column,
                            start_row=start_row, stop_row=stop_row, job_row=job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
            job_row = JOBS.add_task('process', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.takeout_chain_job('images', [model_link], chain_input, model_key,
                                     start_row=start_row, stop_row=stop_row, job_row=job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
            job_row = JOBS.add_task('process', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.takeout_chain_job('images', list(model_chain), chain_inputs[0], model_key,
                                     start_row=start_row, stop_row=stop_row, job_row=job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            job_row = JOBS.add_task('crawl', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.flickr_job(params, start_row, stop_row, job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        elif action == 'o/crawl/streetview':
            self._slice_validate(start_row, stop_row, 'w')
            job_row = JOBS.add_task('crawl', self.owner,
                                    {'startRow': base64.b64encode(start_row),
                                     'stopRow': base64.b64encode(stop_row),
                                     'table': self.table, 'action': action}, {})
            thrift.street_view_job(params, start_row, stop_row, job_row)
            return dict((base64.b64encode(k), base64.b64encode(v))
                        for k, v in {'row': job_row, 'table': 'jobs'}.items())
        else:
            bottle.abort(400, 'Invalid parameter value [action]')
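
# Hedged sketch: each job-launching branch of post_slice above returns the
# same envelope, {'row': <job row>, 'table': 'jobs'}, with keys and values
# base64 encoded. A client can invert it like this (helper name is
# illustrative):
import base64


def _example_decode_job_response(response):
    return dict((base64.b64decode(k), base64.b64decode(v))
                for k, v in response.items())

# _example_decode_job_response(resp)['row'] gives the row to poll in the jobs table.
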
def post_slice(self, start_row, stop_row, params, files):
    action = params['action']
    with thrift_lock() as thrift:
        manager = PicarusManager(thrift=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_exif(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/preprocess':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_preprocessor(base64.urlsafe_b64decode(params['model']),
                                       start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/classify':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']),
                                          start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/feature':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.takeout_link_job(base64.urlsafe_b64decode(params['model']),
                                     start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
            manager.takeout_chain_job([model_link], chain_input, model_key,
                                      start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
            manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key,
                                      start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/hash':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_hash(base64.urlsafe_b64decode(params['model']),
                                    start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'i/dedupe/identical':
            self._slice_validate(start_row, stop_row, 'r')
            col = base64.urlsafe_b64decode(params['column'])
            # Group rows by the MD5 of the column value; identical values collide
            features = {}
            dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()),
                                                              []).append(base64.urlsafe_b64encode(x))
            for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                     start_row=start_row, per_call=10,
                                                                     stop_row=stop_row):
                dedupe_feature(cur_row, cur_col)
            bottle.response.headers["Content-type"] = "application/json"
            return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            # Only slices where the start_row can be used as a prefix may be used
            assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
            p = {}
            row_prefix = start_row
            assert row_prefix.find(':') != -1
            class_name = params['className']
            query = params.get('query')
            query = class_name if query is None else query
            p['lat'] = params.get('lat')
            p['lon'] = params.get('lon')
            p['radius'] = params.get('radius')
            p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
            p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
            if 'hasGeo' in params:
                p['has_geo'] = params['hasGeo'] == '1'
            try:
                p['min_upload_date'] = int(params['minUploadDate'])
            except KeyError:
                pass
            try:
                p['max_upload_date'] = int(params['maxUploadDate'])
            except KeyError:
                pass
            try:
                p['page'] = int(params['page'])
            except KeyError:
                pass
            return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix),
                                                     class_name, query, **p)}
        elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
            self._slice_validate(start_row, stop_row, 'r')
            secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            p = {}
            image_column = base64.urlsafe_b64decode(params['imageColumn'])
            if action == 'io/annotate/image/entity':
                entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                                   base64.urlsafe_b64encode(stop_row),
                                                                                   entity_column, image_column)
                p['type'] = 'image_entity'
            elif action == 'io/annotate/image/query':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                         base64.urlsafe_b64encode(stop_row),
                                                                         image_column)
                p['type'] = 'image_query'
                p['query'] = query
            elif action == 'io/annotate/image/query_batch':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                         base64.urlsafe_b64encode(stop_row),
                                                                         image_column)
                p['type'] = 'image_query_batch'
                p['query'] = query
            else:
                bottle.abort(400)
            p['num_tasks'] = 100
            p['mode'] = 'standalone'
            try:
                redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
            except annotators.CapacityException:
                bottle.abort(503)
            p['setup'] = True
            p['reset'] = True
            p['secret'] = secret
            p['redis_address'] = redis_host
            p['redis_port'] = int(redis_port)
            mturk_vision.manager(data=data, **p)
            return {'task': task}
        else:
            bottle.abort(400)
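
# Hedged helper mirroring the 'o/crawl/flickr' precondition above: the slice
# must be exactly a key-prefix scan, i.e. stop_row equals start_row with its
# last byte incremented. Given a prefix (not ending in '\xff', as the assert
# requires), a caller can derive the matching stop_row (helper name is
# illustrative):
def _example_prefix_to_stop_row(row_prefix):
    assert row_prefix and ord(row_prefix[-1]) != 255
    return row_prefix[:-1] + chr(ord(row_prefix[-1]) + 1)

# _example_prefix_to_stop_row('sun397:') == 'sun397;'
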
def post_row(self, row, params, files):
    if files:
        bottle.abort(400, 'Table does not support files')
    params = dict((k, base64.b64decode(v)) for k, v in params.items())
    action = params['action']
    with thrift_lock() as thrift:
        manager = PicarusManager(db=thrift)
        if action in ('i/link', 'i/chain', 'io/link', 'io/chain'):
            model_key = params['model']
            write_result = action.startswith('io/')
            if write_result:
                self._row_validate(row, 'rw')
            else:
                self._row_validate(row, 'r')
            if action.endswith('/link'):
                chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
                binary_input = thrift.get_column(self.table, row, chain_input)
                model = picarus_takeout.ModelChain(msgpack.dumps([model_link]))
            else:
                chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
                binary_input = thrift.get_column(self.table, row, chain_inputs[0])
                model = picarus_takeout.ModelChain(msgpack.dumps(list(model_chain)))
            bottle.response.headers["Content-type"] = "application/json"
            model_out = model.process_binary(binary_input)
            if write_result:
                thrift.mutate_row(self.table, row, {model_key: model_out})
            return json.dumps({base64.b64encode(model_key): base64.b64encode(model_out)})
        elif action in ('io/thumbnail', 'i/thumbnail'):
            # TODO: Refactor this, it shares code with link/chain
            write_result = action.startswith('io/')
            if write_result:
                self._row_validate(row, 'rw')
            else:
                self._row_validate(row, 'r')
            # Makes 150x150 thumbnails from the data:image column
            model_chain = [{'name': 'picarus.ImagePreprocessor',
                            'kw': {'method': 'force_square', 'size': 150, 'compression': 'jpg'}}]
            model = picarus_takeout.ModelChain(msgpack.dumps(list(model_chain)))
            bottle.response.headers["Content-type"] = "application/json"
            model_out = model.process_binary(thrift.get_column(self.table, row, 'data:image'))
            if write_result:
                thrift.mutate_row(self.table, row, {'thum:image_150sq': model_out})
            return json.dumps({base64.b64encode('thum:image_150sq'): base64.b64encode(model_out)})
        else:
            bottle.abort(400, 'Invalid parameter value [action]')
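
# Hedged sketch: the ModelChain payload built in post_row is a msgpack'd list
# of {'name': ..., 'kw': ...} steps. The thumbnail chain can be serialized
# standalone like this; running it requires the picarus_takeout extension,
# and the variable names here are illustrative:
import msgpack

_example_thumbnail_chain = msgpack.dumps([{'name': 'picarus.ImagePreprocessor',
                                           'kw': {'method': 'force_square',
                                                  'size': 150,
                                                  'compression': 'jpg'}}])
# model = picarus_takeout.ModelChain(_example_thumbnail_chain)
# thumb_jpeg = model.process_binary(jpeg_bytes)
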
def post_slice(self, start_row, stop_row, params, files):
    if files:
        bottle.abort(400)
    params = {k: base64.b64decode(v) for k, v in params.items()}
    action = params['action']
    with thrift_new() as thrift:
        manager = PicarusManager(thrift=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_exif(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
            manager.takeout_chain_job([model_link], chain_input, model_key,
                                      start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
            manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key,
                                      start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'i/faces':  # TODO: Temporary, remove when done
            names = set(['George_W_Bush', 'Colin_Powell', 'Tony_Blair', 'Donald_Rumsfeld',
                         'Gerhard_Schroeder', 'Ariel_Sharon', 'Hugo_Chavez', 'Junichiro_Koizumi',
                         'Serena_Williams', 'John_Ashcroft'])
            self._slice_validate(start_row, stop_row, 'r')
            import cv2
            import numpy as np
            r = None
            labels = {}
            pos = 0
            neg = 0
            data = []
            lab = []
            num_train = 2000
            for n, (cur_row, cur_cols) in enumerate(hadoopy_hbase.scanner(thrift, self.table,
                                                                          start_row=start_row, per_call=10,
                                                                          stop_row=stop_row,
                                                                          columns=['data:image', 'meta:class'])):
                cur_class = cur_cols['meta:class']
                if cur_class not in names:
                    continue
                if cur_class not in labels:
                    labels[cur_class] = len(labels)
                label = labels[cur_class]
                image = cv2.imdecode(np.fromstring(cur_cols['data:image'], np.uint8), 0)
                # Crop
                image = np.ascontiguousarray(image[62:-62, 62:-62])
                #if n == 0:
                #    cv2.imwrite('out.png', image)
                if n < num_train:
                    lab.append(label)
                    data.append(image)
                else:
                    if r is None:
                        r = cv2.createLBPHFaceRecognizer()
                        r.train(data, np.array(lab))
                        print('TRAINED-----------------------')
                    pred = r.predict(image)[0]
                    print((pred, label))
                    if pred == label:
                        pos += 1
                    else:
                        neg += 1
                    print((cur_class, image.shape, n, pos, neg, pos / float(pos + neg + .00000001)))
        elif action == 'io/garbage':
            self._slice_validate(start_row, stop_row, 'rw')
            columns_removed = set()
            columns_kept = set()
            # TODO: Get all user models and save those too
            active_models = set()
            for cur_row, cur_cols in hadoopy_hbase.scanner(thrift, self.table, filter='KeyOnlyFilter()',
                                                           start_row=start_row, per_call=10, stop_row=stop_row):
                for k in cur_cols.keys():
                    if not (k.startswith('meta:') or k.startswith('thum:') or k == 'data:image' or k in active_models):
                        if k not in columns_removed:
                            columns_removed.add(k)
                            print(columns_removed)
                            print(len(columns_removed))
                    else:
                        if k not in columns_kept:
                            columns_kept.add(k)
                            print(columns_kept)
                            print(len(columns_kept))
            return {'columnsRemoved': list(columns_removed), 'columnsKept': list(columns_kept)}
        elif action == 'i/dedupe/identical':
            self._slice_validate(start_row, stop_row, 'r')
            col = params['column']
            features = {}
            dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()),
                                                              []).append(base64.b64encode(x))
            for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                     start_row=start_row, per_call=10,
                                                                     stop_row=stop_row):
                dedupe_feature(cur_row, cur_col)
            bottle.response.headers["Content-type"] = "application/json"
            return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            # Only slices where the start_row can be used as a prefix may be used
            assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
            p = {}
            row_prefix = start_row
            assert row_prefix.find(':') != -1
            print('params[%r]' % params)
            class_name = params.get('className')
            query = params['query']
            p['lat'] = params.get('lat')
            p['lon'] = params.get('lon')
            p['radius'] = params.get('radius')
            p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
            p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
            if not p['api_key'] or not p['api_secret']:
                bottle.abort(400)  # Either we don't have a default or the user provided an empty key
            if 'hasGeo' in params:
                p['has_geo'] = params['hasGeo'] == '1'
            if 'onePerOwner' in params:
                p['one_per_owner'] = params['onePerOwner'] == '1'
            try:
                p['min_upload_date'] = int(params['minUploadDate'])
            except KeyError:
                pass
            try:
                p['max_rows'] = int(params['maxRows'])
            except KeyError:
                pass
            try:
                p['max_upload_date'] = int(params['maxUploadDate'])
            except KeyError:
                pass
            try:
                p['page'] = int(params['page'])
            except KeyError:
                pass
            return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix),
                                                     class_name=class_name, query=query, **p)}
        else:
            bottle.abort(400)
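
# Hedged refactor sketch: the 'i/dedupe/identical' logic appears in two
# post_slice variants above (one base64-encodes row keys, the other uses the
# urlsafe variant). Extracted as a pure function (name illustrative), it
# groups row keys by the MD5 of a column value and keeps only duplicate groups:
import base64
import hashlib


def _example_find_identical(row_value_pairs):
    features = {}
    for row, value in row_value_pairs:
        digest = base64.b64encode(hashlib.md5(value).digest())
        features.setdefault(digest, []).append(base64.b64encode(row))
    return [{'rows': rows} for rows in features.values() if len(rows) > 1]

# _example_find_identical([('r1', 'x'), ('r2', 'x'), ('r3', 'y')])
# -> [{'rows': ['cjE=', 'cjI=']}]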