def compute_db_hadoop(self, hdfs_path):
    import json
    si = picarus.api.SearchIndex()
    si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
    si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
        # Compute features (map)
        picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature',
                                         self.feature_dict, files=self.required_files)
        # Random sample features for hashes (map) and train hasher (reduce)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'),
                              cmdenvs={'KV_PROB': 1., 'HASH_BITS': 128})
        hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
        si.hash = pickle.dumps(hasher, -1)
        si.hash_format = si.PICKLE
        # Compute feature hashes (map) and build database (reduce)
        open('hasher.pkl', 'w').write(si.hash)
        hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'),
                              files=['hasher.pkl'])
        metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
        self.metadata = metadata
        si.metadata.extend(metadata.tolist())
        self.index = image_search.LinearHashDB().store_hashes(hashes,
                                                              np.arange(len(metadata), dtype=np.uint64))
        si.index = pickle.dumps(self.index, -1)
        si.index_format = si.PICKLE
        open('index.pb', 'w').write(si.SerializeToString())
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) where X >= m
    with probability (1 - p).  If more kv pairs are returned from Hadoop, then
    they are ignored.  The resulting kv pairs are uniformly random from the input.

    Args:
        hdfs_input: HDFS input path
        m: Desired number of samples (you will get this many as long as n >= m,
            with probability (1 - p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)
        hdfs_temp_dir: HDFS temporary directory (default None)

    Yields:
        Sampled k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'), cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
        if num_outputs < m:
            logging.warn('random_sampler: num_outputs[%d] when m[%d].  To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
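# The helper _random_sample_alpha is referenced above but not shown; the sketch
# below is one way it could be implemented.  It bisects for the smallest
# keep-probability alpha at which a Binomial(n, alpha) draw reaches m with
# probability at least (1 - p).  The use of scipy.stats.binom and the bisection
# details are assumptions, not the original implementation.
import scipy.stats


def _random_sample_alpha(n, m, p=.01):
    lo, hi = float(m) / n, 1.  # below m/n failure is likely; at 1. every kv pair is kept
    for _ in xrange(100):  # bisect until the bracket is tight
        mid = (lo + hi) / 2.
        # Probability that fewer than m of the n kv pairs survive sampling at rate mid
        if scipy.stats.binom.cdf(m - 1, n, mid) > p:
            lo = mid  # still fails too often, keep more
        else:
            hi = mid  # safe, try keeping fewer
    return hi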
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path, hdfs_output, 'clip_boxes.py', files=['image_box_fns.pkl'],
                              remove_output=True, cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'w') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    with hadoopy_helper.hdfs_temp() as input_path:
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read()) for f in node['files'])
            cmdenvs = {'job_id': job_id,
                       'hadoopy_rt_redis': redis_host}
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%d' % (input_path, node['name']), [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty', _lf('hadoopy_rt_job.py'),
                       cmdenvs=cmdenvs, jobconfs=jobconfs_base)
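# A hypothetical invocation sketch for launch_map_update above.  The node keys
# ('name', 'script_path', 'cmdenvs', 'files', 'outputs') mirror what the function
# reads; the concrete script path, file, job id, and Redis host are placeholders.
nodes = [{'name': 0,
          'script_path': 'scripts/ingest.py',
          'cmdenvs': {'BATCH_SIZE': '64'},
          'files': ['models/model.pkl'],
          'outputs': [1]}]
launch_map_update(nodes, job_id='demo_job', redis_host='localhost')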
def count_kvs(hdfs_input, hdfs_temp_dir=None):
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('count.py'), num_reducers=1)
        return sum(x for _, x in hadoopy.readtb(hdfs_output))
def unique_keys(hdfs_input, hdfs_temp_dir=None):
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('unique_keys.py'))
        for x in hadoopy.readtb(hdfs_output):
            yield x[0]
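# A hypothetical usage sketch tying the helpers above together: count the kv pairs
# in an HDFS SequenceFile, list its distinct keys, and draw 100 uniformly random
# samples.  The input path is a placeholder.
if __name__ == '__main__':
    hdfs_input = '/user/example/data'
    print(count_kvs(hdfs_input))
    for key in unique_keys(hdfs_input):
        print(key)
    samples = list(random_sample(hdfs_input, 100))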