# Example #1
# 0
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # Open the S3 connection only the first time an s3 source is seen.
             if self.s3_conn is None:
                 self.s3_conn = ScalableS3Client().s3
             s3_entries = generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length)
             for _bucket, _root, entry_path in s3_entries:
                 yield ExternalURL(url_path_join(src, entry_path))
         elif src.startswith('hdfs'):
             for hdfs_path, hdfs_size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                 # Skip empty files unless the caller explicitly asked for them.
                 if hdfs_size == 0 and not self.include_zero_length:
                     continue
                 if any(fnmatch.fnmatch(hdfs_path, pattern) for pattern in self.include):
                     yield ExternalURL(hdfs_path)
         else:
             # Local filesystem: match include patterns against the path relative to src.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, filenames in os.walk(src):
                 for name in filenames:
                     full_path = os.path.join(dirpath, name)
                     relative = os.path.relpath(full_path, src)
                     if any(fnmatch.fnmatch(relative, pattern) for pattern in self.include):
                         yield ExternalURL(full_path)
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions.

     Supports three kinds of sources: s3 URLs, hdfs URLs, and local directories.
     """
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = ScalableS3Client().s3
             for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             # Use luigi.contrib.hdfs: the bare luigi.hdfs module path is the
             # pre-contrib layout and has been removed from luigi; this also
             # matches the other copy of this method in the file.
             for source, size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                         yield ExternalURL(filepath)
# Example #3
# 0
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns.

    Prints one line per matching key: the key's size (or -1 when the key
    cannot be fetched) followed by its path.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        src_key = get_s3_key(s3_conn, source)
        # Parenthesized print is a single expression under Python 2 and the
        # print function under Python 3, so it behaves identically on both.
        print("%10d %s" % (src_key.size if src_key is not None else -1, path))
# Example #4
# 0
def list_s3_files(source_url, patterns):
    """List remote s3 files that match one of the patterns.

    Prints one line per matching key: the key's size (or -1 when the key
    cannot be fetched) followed by its path.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url,
                                                  patterns):
        source = join_as_s3_url(bucket, root, path)
        src_key = get_s3_key(s3_conn, source)
        # Parenthesized print is a single expression under Python 2 and the
        # print function under Python 3, so it behaves identically on both.
        print("%10d %s" % (src_key.size if src_key is not None else -1, path))
# Example #5
# 0
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination.

    Each matching key is flattened into a single local filename by replacing
    '/' in its path with '_', then downloaded into dest_root.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url, patterns):
        source = join_as_s3_url(bucket, root, path)
        # Flatten the key path so all downloads land directly in dest_root.
        dest_name = path.replace('/', '_')
        destination = os.path.join(dest_root, dest_name)
        src_key = get_s3_key(s3_conn, source)
        if src_key is not None:
            src_key.get_contents_to_filename(destination)
        else:
            # Parenthesized print works on both Python 2 and Python 3.
            print("No key for source " + source)
# Example #6
# 0
def get_s3_files(source_url, dest_root, patterns):
    """Copy remote s3 files that match one of the patterns to a local destination.

    Each matching key is flattened into a single local filename by replacing
    '/' in its path with '_', then downloaded into dest_root.
    """
    s3_conn = connect_s3()
    for bucket, root, path in generate_s3_sources(s3_conn, source_url,
                                                  patterns):
        source = join_as_s3_url(bucket, root, path)
        # Flatten the key path so all downloads land directly in dest_root.
        dest_name = path.replace('/', '_')
        destination = os.path.join(dest_root, dest_name)
        src_key = get_s3_key(s3_conn, source)
        if src_key is not None:
            src_key.get_contents_to_filename(destination)
        else:
            # Parenthesized print works on both Python 2 and Python 3.
            print("No key for source " + source)
    def _make_s3_generator(self, bucket_name, root, path_info, patterns):
        """Generates a list of matching S3 sources using a mock S3 connection.

        Builds a mock connection whose bucket listing returns one fake key per
        (path, size) entry in path_info, then collects what
        s3_util.generate_s3_sources yields for it.
        """
        s3_conn = MagicMock()
        s3_bucket = MagicMock()
        s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
        # items() iterates identically on Python 2 and 3 (iteritems() is 2-only).
        target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                       for path, size in path_info.items()]
        s3_bucket.list = MagicMock(return_value=target_list)
        # Debug dump of the mocked keys; parenthesized print runs on 2 and 3.
        print([(k.key, k.size) for k in target_list])

        s3_bucket.name = bucket_name
        source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
        generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
        output = list(generator)
        return output
# Example #8
# 0
    def _make_s3_generator(self, bucket_name, root, path_info, patterns):
        """
        Generates a list of matching S3 sources using a mock S3 connection.

        Builds a mock connection whose bucket listing returns one fake key per
        (path, size) entry in path_info, then collects what
        s3_util.generate_s3_sources yields for it.
        """
        s3_conn = MagicMock()
        s3_bucket = MagicMock()
        s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
        # items(): this copy already uses the Python 3 print function, but
        # iteritems() is Python-2-only and would raise AttributeError on 3.
        target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                       for path, size in path_info.items()]
        s3_bucket.list = MagicMock(return_value=target_list)
        # Debug dump of the mocked keys.
        print([(k.key, k.size) for k in target_list])

        s3_bucket.name = bucket_name
        source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
        generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
        output = list(generator)
        return output
        s3_key.size = size
        return s3_key

    def _make_s3_generator(self, bucket_name, root, path_info, patterns):
        """Generates a list of matching S3 sources using a mock S3 connection.

        Builds a mock connection whose bucket listing returns one fake key per
        (path, size) entry in path_info, then collects what
        s3_util.generate_s3_sources yields for it.
        """
        s3_conn = MagicMock()
        s3_bucket = MagicMock()
        s3_conn.get_bucket = MagicMock(return_value=s3_bucket)
        # items() iterates identically on Python 2 and 3 (iteritems() is 2-only).
        target_list = [self._make_key("{root}/{path}".format(root=root, path=path), size)
                       for path, size in path_info.items()]
        s3_bucket.list = MagicMock(return_value=target_list)
        # Debug dump of the mocked keys; parenthesized print runs on 2 and 3.
        print([(k.key, k.size) for k in target_list])

        s3_bucket.name = bucket_name
        source = "s3://{bucket}/{root}".format(bucket=bucket_name, root=root)
        generator = s3_util.generate_s3_sources(s3_conn, source, patterns)
        output = list(generator)
        return output

    def _run_without_filtering(self, bucket_name, root, path_info):
        """Runs generator with a match-everything pattern and checks output.

        Every entry in path_info must come back as a (bucket, root, key) tuple.
        """
        patterns = ['*']
        output = self._make_s3_generator(bucket_name, root, path_info, patterns)
        # assertEqual: assertEquals is a long-deprecated unittest alias.
        self.assertEqual(len(output), len(path_info))
        expected = [(bucket_name, root, key) for key in path_info]
        # Compare as sets since the generator's ordering is unspecified.
        self.assertEqual(set(output), set(expected))

    def test_normal_generate(self):
        bucket_name = "bucket_name"
        root = "root1/root2"
        path_info = {