def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily as needed.
            if self.s3_conn is None:
                self.s3_conn = boto.connect_s3()
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: implement include_zero_length filtering to match the S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
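# Illustration (a hedged sketch, not part of the source): how the include
# patterns above are matched against paths relative to a src directory.
# The file path and pattern below are hypothetical.
import fnmatch
import os

filepath = '/data/src/2014-01-01/tracking.log'
relpath = os.path.relpath(filepath, '/data/src')  # '2014-01-01/tracking.log'
include = ['*tracking.log*']
print any(fnmatch.fnmatch(relpath, pattern) for pattern in include)  # True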
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns."""
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        src_key = get_s3_key(s3_conn, source)
        print "%10d %s" % (src_key.size if src_key is not None else -1, path)
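# Usage sketch (hedged; the bucket, prefix, and pattern are hypothetical, and
# boto credentials must be available in the environment). Each matching key
# prints its size right-aligned in 10 columns, or -1 if the key lookup
# returned nothing:
#
#     list_s3_files('s3://my-bucket/logs/', ['*tracking.log*.gz'])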
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination."""
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        dest_name = path.replace('/', '_')
        destination = os.path.join(dest_root, dest_name)
        src_key = get_s3_key(s3_conn, source)
        if src_key is not None:
            src_key.get_contents_to_filename(destination)
        else:
            print "No key for source " + source
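# Usage sketch (hedged; the URL and directory are hypothetical). Slashes in
# each matched key's path below the source root become underscores locally,
# so '2014-01-01/tracking.log' is written as '2014-01-01_tracking.log':
#
#     get_s3_files('s3://my-bucket/logs/', '/tmp/downloads', ['*.log'])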
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    if self.src.startswith('s3'):
        # Connect lazily as needed.
        if self.s3_conn is None:
            self.s3_conn = boto.connect_s3()
        for _bucket, _root, path in generate_s3_sources(self.s3_conn, self.src, self.include):
            source = url_path_join(self.src, path)
            yield ExternalURL(source)
    else:
        # Apply the include patterns to the relative path below the src directory.
        for dirpath, _dirnames, files in os.walk(self.src):
            for filename in files:
                filepath = os.path.join(dirpath, filename)
                relpath = os.path.relpath(filepath, self.src)
                if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                    yield ExternalURL(filepath)
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily as needed.
            if self.s3_conn is None:
                self.s3_conn = boto.connect_s3()
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
def download_output_files(self):
    """Download the expected output files from S3 and decompress them into a temporary directory."""
    self.assertEqual(
        len(list(generate_s3_sources(self.s3_client.s3, self.test_out))),
        len(self.output_files)
    )
    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)
    self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
    os.makedirs(self.downloaded_outputs)
    for output_file in self.output_files:
        local_file_name = self.generate_file_name(output_file)
        remote_url = url_path_join(self.test_out, output_file['course_id'], "events", local_file_name + '.gz')
        downloaded_output_path = get_file_from_key(self.s3_client, remote_url, self.downloaded_outputs)
        if downloaded_output_path is None:
            self.fail('Unable to find expected output file {0}'.format(remote_url))
        decompressed_file_name = downloaded_output_path[:-len('.gz')]
        output_file['downloaded_path'] = decompressed_file_name
        fs.decompress_file(downloaded_output_path, decompressed_file_name)
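# Note on the paths built above (a hedged illustration; the names are
# hypothetical): url_path_join('s3://bucket/out', 'edX-Demo-2014', 'events',
# 'log.gz') yields 's3://bucket/out/edX-Demo-2014/events/log.gz', and the
# slice downloaded_output_path[:-len('.gz')] strips the compression suffix,
# e.g. '/tmp/output/log.gz' becomes '/tmp/output/log'.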
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns."""
    s3_conn = connect_s3()
    for _bucket, _root, path in generate_s3_sources(s3_conn, source_url, patterns):
        print path
    s3_key.size = size
    return s3_key

def _make_s3_generator(self, bucket_name, root, path_info, patterns):
    """Generates a list of matching S3 sources using a mock S3 connection."""
    s3_conn = MagicMock()
    s3_bucket = MagicMock()
    s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
    target_list = [
        self._make_key("{root}/{path}".format(root=root, path=path), size)
        for path, size in path_info.iteritems()
    ]
    s3_bucket.list = MagicMock(return_value=target_list)
    print [(k.key, k.size) for k in target_list]
    s3_bucket.name = bucket_name
    source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
    generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
    output = list(generator)
    return output

def _run_without_filtering(self, bucket_name, root, path_info):
    """Runs generator and checks output."""
    patterns = ['*']
    output = self._make_s3_generator(bucket_name, root, path_info, patterns)
    self.assertEquals(len(output), len(path_info))
    expected = [(bucket_name, root, key) for key in path_info]
    self.assertEquals(set(output), set(expected))

def test_normal_generate(self):
    bucket_name = "bucket_name"
    root = "root1/root2"
    path_info = {