def post_slice(self, start_row, stop_row, params, files):
    action = params['action']
    with thrift_lock() as thrift:
        manager = PicarusManager(thrift=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_exif(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/preprocess':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/classify':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/feature':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
            manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
            manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/hash':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'i/dedupe/identical':
            self._slice_validate(start_row, stop_row, 'r')
            col = base64.urlsafe_b64decode(params['column'])
            features = {}
            dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.urlsafe_b64encode(x))
            for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                     start_row=start_row, per_call=10,
                                                                     stop_row=stop_row):
                dedupe_feature(cur_row, cur_col)
            bottle.response.headers["Content-type"] = "application/json"
            return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            # Only slices where the start_row can be used as a prefix may be used
            assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
            p = {}
            row_prefix = start_row
            assert row_prefix.find(':') != -1
            class_name = params['className']
            query = params.get('query')
            query = class_name if query is None else query
            p['lat'] = params.get('lat')
            p['lon'] = params.get('lon')
            p['radius'] = params.get('radius')
            p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
            p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
            if 'hasGeo' in params:
                p['has_geo'] = params['hasGeo'] == '1'
            try:
                p['min_upload_date'] = int(params['minUploadDate'])
            except KeyError:
                pass
            try:
                p['max_upload_date'] = int(params['maxUploadDate'])
            except KeyError:
                pass
            try:
                p['page'] = int(params['page'])
            except KeyError:
                pass
            return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
        elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
            self._slice_validate(start_row, stop_row, 'r')
            secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            p = {}
            image_column = base64.urlsafe_b64decode(params['imageColumn'])
            if action == 'io/annotate/image/entity':
                entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                                   base64.urlsafe_b64encode(stop_row),
                                                                                   entity_column, image_column)
                p['type'] = 'image_entity'
            elif action == 'io/annotate/image/query':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                         base64.urlsafe_b64encode(stop_row), image_column)
                p['type'] = 'image_query'
                p['query'] = query
            elif action == 'io/annotate/image/query_batch':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row),
                                                                         base64.urlsafe_b64encode(stop_row), image_column)
                p['type'] = 'image_query_batch'
                p['query'] = query
            else:
                bottle.abort(400)
            p['num_tasks'] = 100
            p['mode'] = 'standalone'
            try:
                redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
            except annotators.CapacityException:
                bottle.abort(503)
            p['setup'] = True
            p['reset'] = True
            p['secret'] = secret
            p['redis_address'] = redis_host
            p['redis_port'] = int(redis_port)
            mturk_vision.manager(data=data, **p)
            return {'task': task}
        else:
            bottle.abort(400)
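
# Illustrative sketch (not part of the handler above, helper name is hypothetical):
# the 'o/crawl/flickr' branch only accepts slices that correspond to a row-key
# prefix, i.e. stop_row must equal start_row with its last byte incremented.
# A minimal way to build such a slice from a prefix, assuming Python 2 byte
# strings as used elsewhere in this module, might look like this:
def _example_prefix_to_slice(prefix):
    # The prefix must be non-empty and must not end in \xff, otherwise
    # incrementing the last byte would not give a valid exclusive stop key.
    assert prefix and ord(prefix[-1]) != 255
    start_row = prefix
    stop_row = prefix[:-1] + chr(ord(prefix[-1]) + 1)
    return start_row, stop_row
# e.g. _example_prefix_to_slice('user0:') == ('user0:', 'user0;'): every row key
# beginning with 'user0:' sorts inside that [start_row, stop_row) range.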
def flickr_job(self, params, start_row, stop_row, job_row):
    # Only slices where the start_row can be used as a prefix may be used
    assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
    p = {}
    row_prefix = start_row
    p['class_name'] = params.get('className')
    p['query'] = params.get('query')
    p['lat'] = params.get('lat')
    p['lon'] = params.get('lon')
    if not ((p['lat'] and p['lon']) or p['query']):
        bottle.abort(400, 'Invalid crawler parameters')
    p['radius'] = params.get('radius')
    p['size'] = params.get('size', 'm')
    if p['size'] not in ('sq', 't', 's', 'q', 'm', 'n', 'z', 'c', 'l', 'o'):
        bottle.abort(400, 'Invalid size')
    p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
    p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
    if not p['api_key'] or not p['api_secret']:
        bottle.abort(400)  # Either we don't have a default or the user provided an empty key
    if 'hasGeo' in params:
        p['has_geo'] = params['hasGeo'] == '1'
    if 'onePerOwner' in params:
        p['one_per_owner'] = params['onePerOwner'] == '1'
    try:
        p['max_rows'] = int(params['maxRows'])
    except KeyError:
        pass
    iterations = min(10000, int(params.get('iterations', 1)))
    try:
        min_upload_date = int(params['minUploadDate'])
    except KeyError:
        min_upload_date = None
    try:
        max_upload_date = int(params['maxUploadDate'])
    except KeyError:
        max_upload_date = None
    try:
        upload_date_radius = int(params['uploadDateRadius'])
        if max_upload_date is None:
            max_upload_date = int(time.time())
        if min_upload_date is None:
            min_upload_date = 1232170610
    except KeyError:
        upload_date_radius = None
    if max_upload_date is not None:
        p['max_upload_date'] = max_upload_date
    if min_upload_date is not None:
        p['min_upload_date'] = min_upload_date
    if iterations > 1 and upload_date_radius is None:
        bottle.abort(400)  # Need to specify a radius for multiple iters or else we will return same vals
    try:
        p['page'] = int(params['page'])
    except KeyError:
        pass
    job_columns = {'goodRows': 0, 'badRows': 0, 'status': 'running'}

    def store(crawl_kwargs, image, source, **kw):
        print('In store')
        # TODO: Need to extend onePerOwner to multiple iterations
        query = crawl_kwargs.get('query')
        class_name = crawl_kwargs.get('class_name')
        cols = {}
        md5 = lambda x: hashlib.md5(x).digest()
        cur_md5 = md5(image)
        cols['data:image'] = image
        if class_name is not None:
            cols['meta:class'] = class_name
        if query is not None:
            cols['meta:query'] = query
        cols['meta:source'] = source
        cols['hash:md5'] = cur_md5
        for x, y in kw.items():
            cols['meta:' + x] = y
        row = row_prefix + cur_md5
        print('Pre mutate row')
        self.mutate_row('images', row, cols)
        print('Post mutate row')
        job_columns['goodRows'] += 1
        self._jobs.update_task(job_row, job_columns)

    for n in range(iterations):
        print('Iter[%d]' % n)
        if upload_date_radius:
            p['min_upload_date'] = random.randint(min_upload_date, max_upload_date - upload_date_radius)
            p['max_upload_date'] = p['min_upload_date'] + upload_date_radius
        crawlers.flickr_crawl(store, **p)
    job_columns['status'] = 'completed'
    self._jobs.update_task(job_row, job_columns)
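
# Illustrative sketch (hypothetical helper, not called by flickr_job): the
# iteration loop above re-samples the crawl's upload-date window each pass so
# that repeated iterations cover different date ranges instead of returning the
# same results.  Isolated, the sampling step amounts to:
def _example_sample_upload_window(min_upload_date, max_upload_date, upload_date_radius):
    # Pick a random window of width upload_date_radius (seconds) that lies
    # entirely inside [min_upload_date, max_upload_date]; relies on the module's
    # existing `random` import.
    lo = random.randint(min_upload_date, max_upload_date - upload_date_radius)
    return lo, lo + upload_date_radius
# e.g. with min_upload_date=1232170610 (Jan 2009), max_upload_date=int(time.time())
# and upload_date_radius=86400, each call yields a random one-day window.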
def post_slice(self, start_row, stop_row, params, files):
    if files:
        bottle.abort(400)
    params = {k: base64.b64decode(v) for k, v in params.items()}
    action = params['action']
    with thrift_new() as thrift:
        manager = PicarusManager(thrift=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_exif(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
            manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = params['model']
            chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
            manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'i/faces':
            # TODO: Temporary, remove when done
            names = set(['George_W_Bush', 'Colin_Powell', 'Tony_Blair', 'Donald_Rumsfeld', 'Gerhard_Schroeder',
                         'Ariel_Sharon', 'Hugo_Chavez', 'Junichiro_Koizumi', 'Serena_Williams', 'John_Ashcroft'])
            self._slice_validate(start_row, stop_row, 'r')
            import cv2
            r = None
            labels = {}
            pos = 0
            neg = 0
            data = []
            lab = []
            num_train = 2000
            for n, (cur_row, cur_cols) in enumerate(hadoopy_hbase.scanner(thrift, self.table, start_row=start_row,
                                                                          per_call=10, stop_row=stop_row,
                                                                          columns=['data:image', 'meta:class'])):
                cur_class = cur_cols['meta:class']
                if cur_class not in names:
                    continue
                if cur_class not in labels:
                    labels[cur_class] = len(labels)
                label = labels[cur_class]
                image = cv2.imdecode(np.fromstring(cur_cols['data:image'], np.uint8), 0)
                # Crop
                image = np.ascontiguousarray(image[62:-62, 62:-62])
                #if n == 0:
                #    cv2.imwrite('out.png', image)
                if n < num_train:
                    lab.append(label)
                    data.append(image)
                else:
                    if r is None:
                        r = cv2.createLBPHFaceRecognizer()
                        r.train(data, np.array(lab))
                        print('TRAINED-----------------------')
                    pred = r.predict(image)[0]
                    print((pred, label))
                    if pred == label:
                        pos += 1
                    else:
                        neg += 1
                    print((cur_class, image.shape, n, pos, neg, pos / float(pos + neg + .00000001)))
        elif action == 'io/garbage':
            self._slice_validate(start_row, stop_row, 'rw')
            columns_removed = set()
            columns_kept = set()
            # TODO: Get all user models and save those too
            active_models = set()
            for cur_row, cur_cols in hadoopy_hbase.scanner(thrift, self.table, filter='KeyOnlyFilter()',
                                                           start_row=start_row, per_call=10, stop_row=stop_row):
                for k in cur_cols.keys():
                    if not (k.startswith('meta:') or k.startswith('thum:') or k == 'data:image' or k in active_models):
                        if k not in columns_removed:
                            columns_removed.add(k)
                            print(columns_removed)
                            print(len(columns_removed))
                    else:
                        if k not in columns_kept:
                            columns_kept.add(k)
                            print(columns_kept)
                            print(len(columns_kept))
            return {'columnsRemoved': list(columns_removed), 'columnsKept': list(columns_kept)}
        elif action == 'i/dedupe/identical':
            self._slice_validate(start_row, stop_row, 'r')
            col = params['column']
            features = {}
            dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.b64encode(x))
            for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                     start_row=start_row, per_call=10,
                                                                     stop_row=stop_row):
                dedupe_feature(cur_row, cur_col)
            bottle.response.headers["Content-type"] = "application/json"
            return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            # Only slices where the start_row can be used as a prefix may be used
            assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
            p = {}
            row_prefix = start_row
            assert row_prefix.find(':') != -1
            print('params[%r]' % params)
            class_name = params.get('className')
            query = params['query']
            p['lat'] = params.get('lat')
            p['lon'] = params.get('lon')
            p['radius'] = params.get('radius')
            p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
            p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
            if not p['api_key'] or not p['api_secret']:
                bottle.abort(400)  # Either we don't have a default or the user provided an empty key
            if 'hasGeo' in params:
                p['has_geo'] = params['hasGeo'] == '1'
            if 'onePerOwner' in params:
                p['one_per_owner'] = params['onePerOwner'] == '1'
            try:
                p['min_upload_date'] = int(params['minUploadDate'])
            except KeyError:
                pass
            try:
                p['max_rows'] = int(params['maxRows'])
            except KeyError:
                pass
            try:
                p['max_upload_date'] = int(params['maxUploadDate'])
            except KeyError:
                pass
            try:
                p['page'] = int(params['page'])
            except KeyError:
                pass
            return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix),
                                                     class_name=class_name, query=query, **p)}
        else:
            bottle.abort(400)
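
# Illustrative sketch (hypothetical, mirrors the 'i/dedupe/identical' branches
# above): row keys are grouped by the MD5 digest of a single column's value, and
# only groups with more than one member are reported as duplicates.  Uses the
# module's existing `hashlib` import.
def _example_group_identical(rows):
    # rows: iterable of (row_key, column_value) pairs, e.g. from an HBase scan.
    groups = {}
    for row_key, value in rows:
        groups.setdefault(hashlib.md5(value).digest(), []).append(row_key)
    return [keys for keys in groups.values() if len(keys) > 1]
# e.g. _example_group_identical([('a', 'x'), ('b', 'x'), ('c', 'y')]) == [['a', 'b']]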