from snakebite.client import AutoConfigClient


def build_hdfs_client():
    # pylint: disable=global-statement
    global _hdfs_client
    _hdfs_client = AutoConfigClient()
    # Warm up the connection to the namenode; otherwise the first call
    # can take 3+ minutes.
    _hdfs_client.df()
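A usage sketch of the pattern above, assuming HADOOP_CONF_DIR (or the default Hadoop config paths) points at a valid client configuration; _hdfs_client is the module-level global the function populates, and '/tmp' is a placeholder path.

_hdfs_client = None

build_hdfs_client()
for entry in _hdfs_client.ls(['/tmp']):
    print(entry['path'])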
Example #2
class HdfsFileManager(FileManagerBase):
    """A wrapper of snakebite client."""

    def can_handle(self, path):
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[File]:
        files = []
        for file in self._client.ls([path], recurse=recursive):
            if file['file_type'] == 'f':
                files.append(File(
                    path=file['path'],
                    size=file['length']))
        return files

    def move(self, source: str, destination: str) -> bool:
        return len(list(self._client.rename([source], destination))) > 0

    def remove(self, path: str) -> bool:
        return len(list(self._client.delete([path]))) > 0

    def copy(self, source: str, destination: str) -> bool:
        # TODO
        raise NotImplementedError()

    def mkdir(self, path: str) -> bool:
        return next(self._client.mkdir([path], create_parent=True))\
            .get('result')
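A usage sketch for the wrapper above, assuming File exposes the path and size fields it is constructed with; the hdfs:// URI below is a placeholder.

manager = HdfsFileManager()
path = 'hdfs://namenode:9000/tmp/data'  # placeholder URI
if manager.can_handle(path):
    manager.mkdir(path + '/subdir')
    for f in manager.ls(path, recursive=True):
        print(f.path, f.size)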
Example #3
 def run(self):
     log.info('initiating snakebite hdfs client')
     try:
         client = AutoConfigClient()
     except krbV.Krb5Error as _:  # pylint: disable=no-member
         if self.verbose:
             print('', file=sys.stderr)
         print(_, file=sys.stderr)
         # cannot proceed without a client, so bail out here
         sys.exit(2)
     start_time = time.time()
     dir_count = 0
     file_count = 0
     repl1_count = 0
     for path in self.path_list:
         try:
             result_list = client.ls([path],
                                     recurse=True,
                                     include_toplevel=True,
                                     include_children=True)
             for result in result_list:
                 if self.verbose and (dir_count + file_count) % 100 == 0:
                     print('.', file=sys.stderr, end='')
                 if result['block_replication'] == 0:
                     dir_count += 1
                     continue
                 file_count += 1
                 if result['block_replication'] == 1:
                     file_path = result['path']
                     repl1_count += 1
                     if self.verbose:
                         print('', file=sys.stderr)
                     print(file_path)
                     if self.replication_factor:
                         log.info('setting replication factor to %s on %s',
                                  self.replication_factor, file_path)
                         # returns a generator so must evaluate in order to actually execute
                         # otherwise you find there is no effect on the replication factor
                         for _ in client.setrep([file_path],
                                                self.replication_factor,
                                                recurse=False):
                             if 'result' not in _:
                                 print(
                                     'WARNING: result field not found in setrep result: {}'
                                     .format(_),
                                     file=sys.stderr)
                                 continue
                             if not _['result']:
                                 print(
                                     'WARNING: failed to setrep: {}'.format(_),
                                     file=sys.stderr)
         except (snakebite.errors.FileNotFoundException,
                 snakebite.errors.RequestError) as _:
             if self.verbose:
                 print('', file=sys.stderr)
             print(_, file=sys.stderr)
     if self.verbose:
         print('', file=sys.stderr)
     secs = int(time.time() - start_time)
     print('\nCompleted in {} secs\n'.format(secs), file=sys.stderr)
     print('{} files with replication factor 1 out of {} files in {} dirs'\
           .format(repl1_count, file_count, dir_count), file=sys.stderr)
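The comment about setrep returning a generator deserves emphasis: snakebite operations such as ls, delete, rename and setrep are lazy generators, and nothing reaches the namenode until they are consumed. A minimal sketch of the safe pattern, with a placeholder path:

from snakebite.client import AutoConfigClient

client = AutoConfigClient()
# nothing is sent to the cluster yet; setrep just returns a generator
pending = client.setrep(['/tmp/somefile'], 3)
# draining the generator performs the actual replication change
for result in list(pending):
    if not result.get('result'):
        print('setrep failed for {}'.format(result.get('path')))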
Example #4
from snakebite.client import AutoConfigClient


def mv(src, dest, overwrite=False):
    """
    src (str) : Source path on HDFS
    dest (str) : Destination path on HDFS
    overwrite (boolean) : Overwrite dest if it exists
    """
    client = AutoConfigClient()

    # rename2 returns a generator; evaluate it so the rename actually runs
    list(client.rename2(src, dest, overwrite))
Example #5
from snakebite.client import AutoConfigClient


def mkdir(hdfs_path, create_parent=False, mode=0o755):
    """
    hdfs_path (str) : Path to create
    create_parent (boolean) : Also create the parent directories
    mode (int) : Mode the directory should be created with
    Returns:
    String mkdir result as json
    """
    client = AutoConfigClient()

    return list(client.mkdir([hdfs_path], create_parent, mode))
Example #6
from snakebite.client import AutoConfigClient


def rm(hdfs_path, recurse=False, force=False):
    """
    hdfs_path (str or list of strings) : HDFS files to delete
    recurse (boolean) : recursively delete the folder
    force (boolean) : force deletion (non-interactive); accepted for API
        symmetry but unused, since snakebite's delete has no force option
    Returns:
    String delete result as json
    """
    client = AutoConfigClient()

    return list(client.delete([hdfs_path], recurse))
Example #7
import os

from snakebite.client import AutoConfigClient


def main():
    hadoop_conf_dir = "/media/d2/code-sky/dockers/hadoop/etc/hadoop"
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir

    file_dict = {}

    cli = AutoConfigClient()
    target_hdfs_path = "/"

    for element in cli.ls([target_hdfs_path]):
        print("Result: " + str(element))
Example #8
from snakebite.client import AutoConfigClient


def ls(hdfs_path, recurse=False, include_toplevel=True, include_children=False):
    """
    Parameters:
    hdfs_path (str) : Path to list
    recurse (boolean) : Recursive listing
    include_toplevel (boolean) : Include the given path in the listing. If the path is a file, include_toplevel is always True.
    include_children (boolean) : Include child nodes in the listing.
    Returns:
    (list) path listings with attributes
    """
    client = AutoConfigClient()

    path_info = list(client.ls([hdfs_path], recurse, include_toplevel, include_children))

    # LsObject is this module's own wrapper around the raw listing
    return LsObject(path_info)
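A usage sketch tying the helpers above together; the paths are placeholders, and note that each call constructs its own AutoConfigClient as written.

mkdir('/tmp/demo/child', create_parent=True)
mv('/tmp/demo/child', '/tmp/demo/renamed', overwrite=True)
listing = ls('/tmp/demo', recurse=True)
rm('/tmp/demo', recurse=True)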
Example #9
 def get_bite(self):
     """
     If Luigi has forked, we have a different PID, and need to reconnect.
     """
     config = hdfs_config.hdfs()
     if self.pid != os.getpid() or not self._bite:
         client_kwargs = dict(
             filter(
                 lambda k_v: k_v[1] is not None and k_v[1] != '',
                 six.iteritems({
                     'hadoop_version': config.client_version,
                     'effective_user': config.effective_user,
                 })))
         if config.snakebite_autoconfig:
             """
             This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well.
             This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode.
             The behaviour is the same as Client.
             """
             from snakebite.client import AutoConfigClient
             self._bite = AutoConfigClient(**client_kwargs)
         else:
             from snakebite.client import Client
             self._bite = Client(config.namenode_host, config.namenode_port,
                                 **client_kwargs)
     return self._bite
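The PID check above is the general recipe for fork safety with socket-holding clients: cache the client together with the PID that created it, and rebuild whenever os.getpid() no longer matches. A standalone sketch of that idea (the class name and factory argument are illustrative, not Luigi API):

import os


class ForkSafeClient(object):
    """Caches one client per process and rebuilds it after a fork."""

    def __init__(self, factory):
        self._factory = factory  # e.g. lambda: AutoConfigClient()
        self._client = None
        self._pid = None

    def get(self):
        if self._client is None or self._pid != os.getpid():
            self._client = self._factory()
            self._pid = os.getpid()
        return self._client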
Example #10
File: hdfs.py Project: gdtm86/luigi
 def get_bite(self):
     """
     If Luigi has forked, we have a different PID, and need to reconnect.
     """
     if self.pid != os.getpid() or not self._bite:
         client_kwargs = dict(
             filter(
                 lambda (k, v): v is not None and v != '', {  # Python 2 only: tuple-parameter lambda
                     'hadoop_version':
                     self.config.getint("hdfs", "client_version", None),
                     'effective_user':
                     self.config.get("hdfs", "effective_user", None)
                 }.iteritems()))
         if self.config.getboolean("hdfs", "snakebite_autoconfig", False):
             """
             This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well.
             This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode.
             The behaviour is the same as Client.
             """
             from snakebite.client import AutoConfigClient
             self._bite = AutoConfigClient(**client_kwargs)
         else:
             from snakebite.client import Client
             self._bite = Client(
                 self.config.get("hdfs", "namenode_host"),
                 self.config.getint("hdfs", "namenode_port"),
                 **client_kwargs)
     return self._bite
Example #11
File: hdfs.py Project: szkielet/luigi
 def get_bite(self):
     """
     If Luigi has forked, we have a different PID, and need to reconnect.
     """
     if self.pid != os.getpid() or not self._bite:
         autoconfig_enabled = self.config.getboolean(
             "hdfs", "snakebite_autoconfig", False)
         if autoconfig_enabled is True:
             """
             This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well.
             This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode.
             The behaviour is the same as Client.
             """
             from snakebite.client import AutoConfigClient
             self._bite = AutoConfigClient()
         else:
             from snakebite.client import Client
             try:
                 ver = self.config.getint("hdfs", "client_version")
                 if ver is None:
                     raise RuntimeError()
                 self._bite = Client(
                     self.config.get("hdfs", "namenode_host"),
                     self.config.getint("hdfs", "namenode_port"),
                     hadoop_version=ver)
             except Exception:
                 self._bite = Client(
                     self.config.get("hdfs", "namenode_host"),
                     self.config.getint("hdfs", "namenode_port"))
     return self._bite
Example #12
File: hdfs_hook.py Project: yunhen/airflow
    def get_conn(self):
        '''
        Returns a snakebite HDFSClient object.
        '''
        connections = self.get_connections(self.hdfs_conn_id)

        use_sasl = False
        if configuration.get('core', 'security') == 'kerberos':
            use_sasl = True

        client = None

        # When using HAClient, proxy_user must be the same, so it is OK to
        # always take the first connection.
        effective_user = self.proxy_user or connections[0].login
        if len(connections) == 1:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
            if autoconfig:
                client = AutoConfigClient(effective_user=effective_user, use_sasl=use_sasl)
            else:
                client = Client(connections[0].host, connections[0].port,
                                effective_user=effective_user, use_sasl=use_sasl)
        elif len(connections) > 1:
            nn = [Namenode(conn.host, conn.port) for conn in connections]
            client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl)
        else:
            raise HDFSHookException("conn_id doesn't exist in the repository")
        
        return client
Example #13
 def test_autoconfig_client_trash_false(self, environ_get):
     environ_get.return_value = False
     HDFSConfig.core_try_paths = (
         self.get_config_path('ha-core-site.xml'), )
     HDFSConfig.hdfs_try_paths = (
         self.get_config_path('ha-noport-hdfs-site.xml'), )
     client = AutoConfigClient()
     self.assertFalse(client.use_trash)
Example #14
class HdfsFileManager(FileManagerBase):
    """A wrapper of snakebite client."""
    def can_handle(self, path):
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[str]:
        files = []
        for file in self._client.ls([path], recurse=recursive):
            if file['file_type'] == 'f':
                files.append(file['path'])
        return files

    def move(self, source: str, destination: str) -> bool:
        return len(list(self._client.rename([source], destination))) > 0

    def remove(self, path: str) -> bool:
        return len(list(self._client.delete([path]))) > 0
Example #15
from getpass import getuser
from subprocess import check_output
from snakebite.client import AutoConfigClient

class HDFSClient(object):
    def __init__(self):
        check_output("hadoop")
        self.fs = AutoConfigClient()

    def homedir(self):
        return "/user/%s/" % getuser()

    def exists(self, path):
        try:
            return self.fs.test(path)
        except Exception:
            return False
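A usage sketch, assuming the hadoop binary is on PATH (the constructor probes for it) and the running user has an HDFS home directory.

hdfs = HDFSClient()
home = hdfs.homedir()
print('%s exists: %s' % (home, hdfs.exists(home)))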
Example #16
File: hdfs.py Project: folly3/airflow-1
    def get_conn(self) -> Any:
        """
        Returns a snakebite HDFSClient object.
        """
        # When using HAClient, proxy_user must be the same, so is ok to always
        # take the first.
        effective_user = self.proxy_user
        autoconfig = self.autoconfig
        use_sasl = conf.get('core', 'security') == 'kerberos'

        try:
            connections = self.get_connections(self.hdfs_conn_id)

            if not effective_user:
                effective_user = connections[0].login
            if not autoconfig:
                autoconfig = connections[0].extra_dejson.get(
                    'autoconfig', False)
            hdfs_namenode_principal = connections[0].extra_dejson.get(
                'hdfs_namenode_principal')
        except AirflowException:
            if not autoconfig:
                raise

        if autoconfig:
            # will read config info from $HADOOP_HOME conf files
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        elif len(connections) == 1:
            client = Client(
                connections[0].host,
                connections[0].port,
                effective_user=effective_user,
                use_sasl=use_sasl,
                hdfs_namenode_principal=hdfs_namenode_principal,
            )
        elif len(connections) > 1:
            name_node = [
                Namenode(conn.host, conn.port) for conn in connections
            ]
            client = HAClient(
                name_node,
                effective_user=effective_user,
                use_sasl=use_sasl,
                hdfs_namenode_principal=hdfs_namenode_principal,
            )
        else:
            raise HDFSHookException("conn_id doesn't exist in the repository "
                                    "and autoconfig is not specified")

        return client
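A usage sketch, assuming this method belongs to Airflow's HDFSHook and that a connection named 'hdfs_default' exists (both the hook name and conn id are assumptions here); get_conn() hands back a plain snakebite client, so the usual generator-based calls apply.

hook = HDFSHook(hdfs_conn_id='hdfs_default')
client = hook.get_conn()
for entry in client.ls(['/user']):  # placeholder path
    print(entry['path'], entry['length'])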
Example #17
import urllib2, os, json, pytz, sys
import datetime, calendar, pprint
import argparse
import snakebite
import subprocess
from snakebite.client import AutoConfigClient
client = AutoConfigClient()

# download.py
# modified version of /home/zsb739/code/libs/ripe-measurement-downloader/experiment_launcher/download.py
# This script downloads data from ripe atlas and stores it in the hdfs


def parse_args():
    parser = argparse.ArgumentParser(
        description='Download daily RIPE data for the provided '
        'measurement ID number')
    parser.add_argument(
        'measurement',
        type=int,
        nargs="+",
        help="The integer identification number for the desired "
        "measurement")
    return parser.parse_args()


def days(start, stop=None):
    if stop is None:
        curr_time = datetime.datetime.utcnow()
        stop_time = datetime.datetime(curr_time.year,
                                      curr_time.month,
Example #18
 def __init__(self):
     self._client = AutoConfigClient()
Example #19
 def __init__(self):
     check_output("hadoop")
     self.fs = AutoConfigClient()
Example #20
        active_namenode = which_active_namenode.communicate()[0].rstrip('\n').rsplit(':')[0]
        break
    else:
        logger.info(node + " is the standby node")
        continue

# bail out if the current node we're running on is not the active namenode.
if active_namenode != socket.getfqdn():
    logger.info("active node " + active_namenode + " is not the current host, so bailing out.")
    exit(-1)
else:
    logger.info("active node is " + active_namenode)



client = AutoConfigClient()

client.use_trash = False

donotdelete_whitelist = [
    # don't remove hadoop-mapred, this kills running jobs
    re.compile("hadoop-mapred"),

    # let's explicitly match hbase.
    re.compile("^/hbase/?"),

    # don't nuke this; hbase uses it for bulk loading.
    re.compile("^/tmp/hbase-staging/?"),

    # let's try to make sure we're not matching against a top-level path
    re.compile("^/[-_.a-zA-Z0-9]+/?$"),
parser.add_argument('-t', '--title', default='Cutflow Efficiency')
parser.add_argument(
    '-x',
    '--NoX',
    action='store_true',
    help='This argument suppresses showing plots via X-forwarding')
parser.add_argument('-o',
                    '--NoOutput',
                    action='store_true',
                    help='This argument suppresses the output of PDF plots')
args = parser.parse_args()

df_list = []
file_list = []

fs = AutoConfigClient()

HT_eff_tot = []
MHT_eff_tot = []
BDP_eff_tot = []
NBJet_eff_tot = []
NJet_eff_tot = []
NVeto_eff_tot = []
M_sq = []
M_lsp = []

for f in fs.text([args.file + '/*/ROOTCuts_output/ROOTCuts.txt']):
    df = pd.read_csv(StringIO(f), delimiter=r'\s+')
    df_HT = df.loc[(df['HT'] > 1200.)]
    df_MHT = df.loc[(df['MHT'] > 200.)]
    df_NBJet = df.loc[(df['NBJet'] > 1)]
Example #22
        logger.info(node + " is the active node")
        which_active_namenode = subprocess.Popen(['hdfs','getconf','-confKey','dfs.namenode.rpc-address.'+hdfs_cluster+'.'+node], stdout=subprocess.PIPE)
        active_namenode = which_active_namenode.communicate()[0].rstrip('\n').rsplit(':')[0]
        break
    else:
        logger.info(node + " is the standby node")
        continue


if active_namenode != socket.getfqdn():
    logger.info("active node " + active_namenode + " is not the current host, so bailing out.")
    exit(-1)
else:
    logger.info("active node is " + active_namenode)

client = AutoConfigClient()


logger.info("Getting user list from /etc/passwd and ldap")

# get a sorted user list from the passwd directory (file+ldap)
user_list = sorted(pwd.getpwall(), key=lambda tup: tup[0])

for user in user_list:
    username = user.pw_name
    userdir = "/user/" + username
    if user.pw_uid <= 500:
        continue
    if user.pw_uid >= 65534:
        continue
    if client.test(userdir, exists=True):
Example #23
from snakebite.client import AutoConfigClient
import pandas as pd

fs = AutoConfigClient()


def ls(paths=['/'], recursive=False):
    if not isinstance(paths, list) and not isinstance(paths, tuple):
        paths = [paths]
    data = [
        dict(size=p['length'],
             path=p['path'],
             ftype=p['file_type'],
             date=p['modification_time']) for p in fs.ls(paths, recursive)
    ]
    df = pd.DataFrame(data)
    df = df.reindex(['size', 'path', 'ftype', 'date'], axis=1)
    df['size'] = df['size'].astype(int)
    df['date'] = pd.to_datetime(df['date'], unit='ms')
    # ignore current uploads
    df = df[~df.path.str.endswith('.upload')]
    return df
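A usage sketch for the DataFrame wrapper above; the paths are placeholders.

df = ls(['/user', '/tmp'], recursive=True)
print(df.sort_values('size', ascending=False).head(10))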