def Exists(self):
  """Return True if self.uri exists and holds a complete set of part files.

  Detecting a PERT as existing while some shards are still being written is
  an open problem; a general fix would record the expected shard count in
  the part filenames or the file payload.  The simpler rule implemented
  here: output produced by the MR framework gets a "_SUCCESS" flag file
  once every part file has been deposited, so we can trust that marker.
  """
  if not py_pert.Exists(self.uri):
    return False
  # A "_logs" subdirectory marks output generated by the MR framework; for
  # such output, completeness is signalled by the "_SUCCESS" flag file.
  generated_by_mr = py_pert.Exists(self.uri + '/_logs')
  if generated_by_mr:
    #LOG(INFO, 'pert uses mr type exist rules: %s' % self.uri)
    if not py_pert.Exists(self.uri + '/_SUCCESS'):
      return False
  # Otherwise verify shards are present with no gaps between the smallest
  # and largest part id.
  # TODO(heathkh): parts with the largest ids could still be missing and we
  # can't tell, because the filenames don't say how many are in the set.
  shard_uris = py_pert.GetShardUris(self.uri)
  if not shard_uris:
    return False
  if not py_pert.ShardSetIsValid(shard_uris):
    return False
  return True
def test_CopyLocalToUri(): local_uri = "local://tmp/data/test_ufs.pert"; remote_uri = "maprfs://data/tmp/test_ufs.pert"; CreateTestFile(local_uri) ok, scheme, path, error = py_pert.ParseUri(local_uri) CHECK(ok) py_pert.CopyLocalToUri(path, remote_uri) CHECK(py_pert.Exists(local_uri)) CHECK(py_pert.Exists(remote_uri)) reader = py_pert.StringTableReader() reader.Open(remote_uri) expected_count = 1000 count = 0 for (key, value), (expected_key, expected_value) in zip(reader, GenerateTestData()): CHECK_EQ(key, expected_key) CHECK_EQ(value, expected_value) count += 1 CHECK_EQ(count, expected_count) print py_pert.ListDirectory(local_uri) print py_pert.ListDirectory(remote_uri) return
def EnsureChunkSizeForUri(uri, desired_block_size):
  """Die unless the file at uri was created with the desired chunk size.

  Args:
    uri: uri of an existing file.
    desired_block_size: expected chunk size in bytes; must be a multiple
      of 2**16.

  Returns:
    True on success (all checks are fatal on failure).
  """
  # Chunk sizes must be a multiple of 2**16.
  CHECK_EQ(desired_block_size % (2**16), 0)
  CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
  CHECK(py_pert.IsFile(uri), 'expected uri to be a file: %s' % uri)
  ok, actual_chunk_size = py_pert.ChunkSize(uri)
  CHECK(ok)
  CHECK_EQ(
      desired_block_size, actual_chunk_size,
      'Expected chunk size of %d but actual chunk size is %d for uri: %s' %
      (desired_block_size, actual_chunk_size, uri))
  return True
def GetChunkSizeForUri(uri):
  """Return the chunk size (in bytes) configured for a maprfs directory.

  Reads the directory's ".dfs_attributes" control file via its NFS path and
  extracts the ChunkSize entry.

  Args:
    uri: uri of an existing directory (chunk size is a directory attribute).

  Returns:
    The chunk size as a long.

  Raises fatally (via CHECK) if the uri is missing, is not a directory, or
  the control file has no ChunkSize entry.
  """
  CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
  CHECK(
      py_pert.IsDirectory(uri),
      'Chunk size only defined for directories... See mapr docs for details')
  nfs_path = mr.UriToNfsPath(uri)
  dfs_attribute_path = '%s/.dfs_attributes' % (nfs_path)
  # Use a context manager so the handle is closed promptly (the original
  # left it to be reclaimed by GC).
  with open(dfs_attribute_path, 'r') as attributes_file:
    lines = attributes_file.readlines()
  # Scan for the ChunkSize entry instead of assuming it sits on line 3; the
  # file also contains comment lines and a Compression setting, and their
  # order is not guaranteed.
  for line in lines:
    tokens = line.split('=')
    if tokens[0].strip() == 'ChunkSize':
      return long(tokens[1])
  CHECK(False, 'no ChunkSize entry found in: %s' % dfs_attribute_path)
def test_OpenSplit(): remote_uri = "maprfs://data/itergraph/tide_v13/photoid_to_image.pert/part-00046" CHECK(py_pert.Exists(remote_uri)) reader = py_pert.StringTableShardReader() split_start = 4598228 split_length = 1255113 split_end = split_start + split_length reader.OpenSplit(remote_uri, split_start, split_end) count = 0 for key, value in reader: count += 1 print count return
def Run(self): print 'about to run pipes flow: %s' % (self.pipes_binary) mr_driver = self.MakeDriver() # set output directory property to create files with required chunk size if self.output_chunk_size_bytes != None: if not py_pert.Exists(self.output_path): nfs_path = mr.UriToNfsPath(self.output_path) os.makedirs(nfs_path) SetChunkSizeForUri(self.output_path, self.output_chunk_size_bytes) CHECK_EQ( GetChunkSizeForUri(self.output_path), self.output_chunk_size_bytes ) # verify the features file will have a block size of 4 GB status = mr_driver.Run() # ensure output was created with the required chunk size if self.output_chunk_size_bytes != None: # ensure the created output has the requested chunk size for uri in py_pert.GetShardUris(self.output_path): EnsureChunkSizeForUri(uri, self.output_chunk_size_bytes) return status
def SetChunkSizeForUri(uri, block_size):
  """Set the chunk size for a maprfs directory via its .dfs_attributes file.

  Args:
    uri: uri of an existing directory.
    block_size: desired chunk size in bytes; must be a multiple of 2**16
      and at most 1 GB.

  Returns:
    True on success (all checks are fatal on failure).
  """
  CHECK_EQ(block_size % (2**16), 0)  # must be a multiple of 2**16
  CHECK_LE(
      block_size, 1024 * (2**20),
      'Currently libmaprfs has a limitation that prevents chunk sizes greater than 1GB.'
  )
  CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
  CHECK(
      py_pert.IsDirectory(uri),
      'Chunk size only defined for directories... See mapr docs for details')
  nfs_path = mr.UriToNfsPath(uri)
  dfs_attribute_path = '%s/.dfs_attributes' % (nfs_path)
  # Use a context manager so the control file is closed even if write()
  # raises (the original relied on an explicit close()).
  with open(dfs_attribute_path, 'w') as control_file:
    control_file.write(
        '# lines beginning with # are treated as comments\nCompression=true\nChunkSize=%d'
        % (block_size))
  # Read the attribute back to verify the setting took effect.
  new_block_size = GetChunkSizeForUri(uri)
  CHECK_EQ(new_block_size, block_size)
  return True
def test_CopyLocalToUri():
  # NOTE(review): this function has the same name as an earlier
  # test_CopyLocalToUri in this file; only the later definition survives at
  # module load, so one of the two never runs — one should be renamed.
  # NOTE(review): 'fingerprint_uri' is not defined in this function;
  # presumably a module-level global — verify against the rest of the file.
  # NOTE(review): the body reads the fingerprint but asserts nothing about
  # it — looks incomplete; confirm intent.
  CHECK(py_pert.Exists(fingerprint_uri))
  input_file = py_pert.OpenInput(fingerprint_uri)
  ok, fingerprint = input_file.ReadToString()
def Exists(self):
  """Return True if this resource's uri currently exists."""
  uri_present = py_pert.Exists(self.uri)
  return uri_present
def CheckUriExistsOrDie(uri):
  """Terminate with a fatal error unless the given uri exists."""
  CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)