Example #1
    def get_fs(self):
        return hdfs3.HDFileSystem(
            host=self.host, port=self.port, pars={
                'input.localread.default.buffersize': '1',
                'input.read.default.verify': '0',
            }
        )
    def store_output_model(self):
        """
          Store tensorflow model stored in local model path to output file path.
        """
        filenames = glob.glob(self.local_model_path + '/*')

        path_list = self.output_file_path.split(os.path.sep)
        if (path_list[0].lower() == 'hdfs:'):
            master, port = path_list[2].split(':')
            hdfs = hdfs3.HDFileSystem(master, port=int(port), user='******')
            output_path = '/' + os.path.join(*path_list[3:])
            print('local_model_path: {a}'.format(a=self.local_model_path))
            print('output_path: {a}'.format(a=self.output_file_path))
            if (hdfs.exists(output_path)):
                hdfs.rm(output_path)
            for file in filenames:
                hdfs.mkdir(output_path)
                if(os.path.isdir(file)):
                    path, filename = os.path.split(file)
                    hdfs.mkdir(output_path + '/' + filename)
                    filesIn2ndLevelFolder = glob.glob(file + '/*')
                    for fileIn2ndLevelFolder in filesIn2ndLevelFolder:
                        path, filenameIn2ndLevelFolder = os.path.split(fileIn2ndLevelFolder)
                        hdfs.put(fileIn2ndLevelFolder, output_path + '/' + filename + '/' + filenameIn2ndLevelFolder,
                                 block_size=1048576)
                else:
                    path, filename = os.path.split(file)
                    hdfs.put(file, output_path + '/' + filename, block_size=1048576)
        else:
            print("local_model_path: ", self.local_model_path)
            print("output_file_path: ", self.output_file_path)
            shutil.copytree(self.local_model_path, self.output_file_path)
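A minimal sketch of the reverse operation, pulling a stored model directory back from HDFS with hdfs3's get(); the host, port, and paths are placeholders rather than values from the example above, only top-level files are copied, and it assumes hdfs3's detailed listing exposes 'name' and 'kind' keys.

import os
import hdfs3

def fetch_output_model(host, port, hdfs_model_dir, local_dir):
    """Copy the top-level files of an HDFS model directory to a local path."""
    hdfs = hdfs3.HDFileSystem(host=host, port=port)
    os.makedirs(local_dir, exist_ok=True)
    for entry in hdfs.ls(hdfs_model_dir, detail=True):
        if entry['kind'] != 'file':   # skip nested directories
            continue
        filename = os.path.basename(entry['name'])
        # hdfs.get() copies a single HDFS file to a local path
        hdfs.get(entry['name'], os.path.join(local_dir, filename))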
Example #3
    def configure(self,
                  location,
                  host='localhost',
                  port=9000,
                  user=None,
                  ticket_cache=None,
                  token=None,
                  pars=None,
                  connect=True,
                  **kwargs):
        """Configure the store backend."""
        self.storage = hdfs3.HDFileSystem(host=host,
                                          port=port,
                                          user=user,
                                          ticket_cache=ticket_cache,
                                          token=token,
                                          pars=pars,
                                          connect=connect)
        if location.startswith('/'):
            location = location[1:]
        self.cachedir = os.path.join(location, 'joblib')
        self.storage.mkdir(self.cachedir)

        # attach required methods using monkey patching trick.
        self.open_object = self.storage.open
        self.object_exists = self.storage.exists
        self.mv = self.storage.mv

        # computation results can be stored compressed for faster I/O
        self.compress = (False
                         if 'compress' not in kwargs else kwargs['compress'])

        # Memory map mode is not supported
        self.mmap_mode = None
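A sketch of exercising the method above directly; HDFSStoreBackend is a hypothetical name for the joblib store-backend class this configure() belongs to, and the location, host, and port values are placeholders.

backend = HDFSStoreBackend()   # hypothetical class defining configure() above

# Connects to the namenode, creates <location>/joblib on HDFS, and attaches
# the open/exists/mv handles that the caching layer expects.
backend.configure('/user/someone/cache', host='namenode', port=8020,
                  compress=True)

assert callable(backend.open_object) and callable(backend.object_exists)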
Example #4
def get_hdfs_connect_handle():
    fs = hdfs3.HDFileSystem(host=dare_settings.DARE_LOCATION,
                            port=dare_settings.DARE_PORT,
                            user=dare_settings.DARE_USER)
    # If dir is already in HDFS, this command does not raise error
    fs.mkdir(dare_settings.DARE_ROOT_PATHNAME)
    return fs
Example #5
def setup_cluster(config):
    if 'scheduler.ip' not in config:
        scheduler_ip = socket.gethostbyname(socket.gethostname())
    else:
        scheduler_ip = config['scheduler.ip']
    cluster = LocalCluster(n_workers=0,
                           ip=scheduler_ip,
                           port=config['scheduler.port'],
                           diagnostics_port=config['scheduler.bokeh_port'])

    if hdfs3 is not None:
        hdfs = hdfs3.HDFileSystem(host=config.get('hdfs.host'),
                                  port=config.get('hdfs.port'))
    else:
        hdfs = None

    knit = Knit(hdfs=hdfs,
                hdfs_home=config.get('hdfs.home'),
                rm=config.get('yarn.host'),
                rm_port=config.get('yarn.port'))

    command = ('$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker '
               '--nprocs={nprocs:d} '
               '--nthreads={nthreads:d} '
               '--memory-limit={memory_limit:d} '
               '{scheduler_address} '
               '> /tmp/worker-log.out '
               '2> /tmp/worker-log.err').format(
                    nprocs=config['worker.processes'],
                    nthreads=config['worker.threads_per_process'],
                    memory_limit=int(config['worker.memory'] * 1e6),
                    scheduler_address=cluster.scheduler.address)

    app_id = knit.start(command,
                        env=config['cluster.env'],
                        num_containers=config['cluster.count'],
                        virtual_cores=config['worker.cpus'],
                        memory=config['worker.memory'],
                        queue=config['yarn.queue'],
                        app_name='dask',
                        checks=False)

    # Add a few missing fields to config before writing to disk
    config2 = config.copy()
    # The ip is optional, the port may be chosen dynamically
    config2['scheduler.ip'] = cluster.scheduler.ip
    config2['scheduler.port'] = cluster.scheduler.port
    # Fill in optional parameters with auto-detected versions
    config2['yarn.host'] = knit.conf['rm']
    config2['yarn.port'] = knit.conf['rm_port']
    config2['hdfs.home'] = knit.hdfs_home
    # Add in runtime information like app_id and daemon pid
    config2['application.id'] = app_id
    config2['application.pid'] = os.getpid()

    return cluster, knit, config2
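A hedged sketch of calling setup_cluster(); the keys mirror the ones the function reads above, every value is a placeholder, and attaching a dask.distributed Client to the returned scheduler address is the usual next step.

from dask.distributed import Client

config = {
    'scheduler.port': 8786,
    'scheduler.bokeh_port': 8787,
    'hdfs.host': 'namenode',
    'hdfs.port': 8020,
    'hdfs.home': '/user/someone',
    'yarn.host': 'resourcemanager',
    'yarn.port': 8032,
    'yarn.queue': 'default',
    'worker.processes': 1,
    'worker.threads_per_process': 4,
    'worker.memory': 4000,          # MB; scaled to bytes for --memory-limit above
    'worker.cpus': 2,
    'cluster.env': 'dask-env.zip',  # environment archive shipped to YARN
    'cluster.count': 4,
}

cluster, knit, config2 = setup_cluster(config)
client = Client(cluster.scheduler.address)  # submit work once containers join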
Example #6
    def __init__(self,
                 namenode='localhost',
                 namenode_port=8020,
                 dest_dtype=None,
                 replication=3):
        self.namenode = namenode
        self.namenode_port = namenode_port
        self.hdfs = hdfs3.HDFileSystem(namenode, port=namenode_port)
        self.dest_dtype = dest_dtype
        self.replication = replication
def get_fs():
    import hdfs3

    return hdfs3.HDFileSystem('localhost',
                              port=8020,
                              pars={
                                  'input.localread.default.buffersize': str(1),
                                  'dfs.client.read.shortcircuit': '1',
                                  'input.read.default.verify': '0'
                              })
Example #8
    def get_conn(self) -> hdfs3.core.HDFileSystem:

        effective_user = self.effective_user

        connection = self.get_connections(self.hdfs_conn_id)[0]

        if not self.effective_user:
            effective_user = connection.login

        return hdfs3.HDFileSystem(host=connection.host, port=connection.port,
                                  user=effective_user)
    def _read_data_file_from_hdfs(self, input_file_path, max_row=None):
        num_rows_to_read = max_row  # if max_row is None, it will read all files
        path_list = input_file_path.split(os.path.sep)
        master, port = path_list[2].split(':')
        hdfs = hdfs3.HDFileSystem(master, port=int(port), user=path_list[4])
        input_file_path = '/' + os.path.join(*path_list[3:])
        with hdfs.open(input_file_path) as f:
            data = pd.read_csv(f, nrows=num_rows_to_read, header=None)
        num_rows = data.shape[0]
        if max_row is not None and num_rows >= max_row:
            data = data.iloc[:max_row]
        return data
Example #10
    def get_fs(self):
        # TODO: maybe this needs to be a context manager, too, so we can do:
        # with ds.get_fs() as fs:
        #   with fs.open("...") as f:
        #       f.read()
        return hdfs3.HDFileSystem(
            host=self.host, port=self.port, pars={
                'input.localread.default.buffersize': '1',
                'input.read.default.verify': '0',
            }
        )
Example #11
    def get_fs(self):
        # TODO: maybe this needs to be a context manager, too, so we can do:
        # with reader.get_fs() as fs:
        #   with fs.open("...") as f:
        #       f.read()
        return hdfs3.HDFileSystem(
            host=self._host, port=self._port, pars={
                'input.localread.default.buffersize': '1',
                'input.read.default.verify': '0',
                'dfs.domain.socket.path': '/run/user/1000/hdfs-short-circuit.socket',
            }
        )
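One way to address the TODO above is to wrap the connection in a context manager; a sketch, assuming disconnect() (which hdfs3 provides) is the right cleanup call and using placeholder host, port, and pars values.

import contextlib
import hdfs3

@contextlib.contextmanager
def open_fs(host, port):
    """Yield a connected HDFileSystem and disconnect it on exit."""
    fs = hdfs3.HDFileSystem(host=host, port=port, pars={
        'input.localread.default.buffersize': '1',
        'input.read.default.verify': '0',
    })
    try:
        yield fs
    finally:
        fs.disconnect()

# with open_fs('namenode', 8020) as fs:
#     with fs.open('/some/path', 'rb') as f:
#         f.read()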
Example #12
File: test_dask.py Project: quartox/knit
def clear():
    c = CondaCreator()
    try:
        yield
    finally:
        shutil.rmtree(c.conda_envs)
        try:
            k = Knit()
            import hdfs3
            hdfs = hdfs3.HDFileSystem()
            hdfs.rm(k.hdfs_home, recursive=True)
        except:
            pass
Example #13
def hdfs(request):
    if request.param == 'hdfs3':
        hdfs = hdfs3.HDFileSystem(host='localhost', port=8020)
    else:
        hdfs = pyarrow.hdfs.connect(host='localhost', port=8020)

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
    hdfs.mkdir(basedir)

    with dask.config.set(hdfs_driver=request.param):
        yield hdfs

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
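A hedged sketch of a test that could consume this fixture; basedir comes from the surrounding module, and both drivers expose the open()/ls() calls used here.

def test_roundtrip(hdfs):
    path = basedir + '/roundtrip.txt'
    # works with either driver: hdfs3 and pyarrow both expose open() and ls()
    with hdfs.open(path, 'wb') as f:
        f.write(b'hello')
    assert any(entry.endswith('roundtrip.txt') for entry in hdfs.ls(basedir))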
Example #14
    def __init__(self, namenode, namenode_port, replication):
        """
        Move data from a local data source to a HDFS filesystem

        Parameters
        ----------
        namenode : str
            hostname of the HDFS namenode
        namenode_port : int
            port of the HDFS namenode (default 8020)
        replication : int
            number of replicas (default 3)
        """
        self.namenode = namenode
        self.namenode_port = namenode_port
        self.replication = replication
        self.hdfs = hdfs3.HDFileSystem(namenode, port=namenode_port)
Example #15
def main():

    module = AnsibleModule(
        argument_spec=dict(
            namenode_host=dict(required=True, type='str'),
            namenode_port=dict(required=False, default=8020, type='int'),
            effective_user=dict(required=False, default=None, type='str'),
            state=dict(choices=['file', 'directory', 'touchz', 'absent'],
                       default=None),
            path=dict(aliases=['dest', 'name'], required=True, type='path'),
            mode=dict(required=False, default=None, type='raw'),
            owner=dict(required=False, default=None, type='str'),
            group=dict(required=False, default=None, type='str'),
            original_basename=dict(
                required=False),  # Internal use only, for recursive ops
            recurse=dict(default=False, type='bool'),
            diff_peek=dict(
                default=None
            ),  # Internal use only, for internal checks in the action plugins
            validate=dict(
                required=False,
                default=None),  # Internal use only, for template and copy
            src=dict(required=False, default=None, type='path'),
        ),
        supports_check_mode=True)

    # Verify that the HDFS client library is available
    if not HAS_HDFS3:
        module.fail_json(msg="Failed to import required python module: hdfs3",
                         details=str(HAS_HDFS3_ERROR))

    # Initialise HDFS client
    try:
        params = module.params
        hdfs_client = hdfs3.HDFileSystem(host=params['namenode_host'],
                                         port=params['namenode_port'],
                                         user=params['effective_user'])
        run(module, hdfs_client)
        hdfs_client.disconnect()
    except ConnectionError:
        ex = get_exception()
        module.fail_json(
            msg='Unable to init HDFS client for %s:%s: %s' %
            (params['namenode_host'], params['namenode_port'], str(ex)))
Example #16
def read_data_file_from_hdfs(input_file_path, max_row=None):

    # input_file_path = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv'
    num_rows_to_read = max_row  # if max_row is None, it will read all files
    # path, filename = os.path.split(input_file_path)
    # print(path, filename)
    path_list = input_file_path.split(os.path.sep)
    print(path_list[0], path_list[2], path_list[4])
    master, port = path_list[2].split(':')
    print(master, port)
    hdfs = hdfs3.HDFileSystem(master, port=int(port), user=path_list[4])
    input_file_path = '/' + os.path.join(*path_list[3:])
    with hdfs.open(input_file_path) as f:
        data = pd.read_csv(f, nrows=num_rows_to_read, header=None)

    num_rows = data.shape[0]
    if max_row is not None and num_rows >= max_row:
        data = data.iloc[:max_row]
    return data
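The manual os.path.sep splitting above assumes a URL shaped exactly like the commented hdfs://host:port/user/<user>/... example; a sketch of the same parsing done with urllib.parse instead (a different technique than the example uses).

from urllib.parse import urlparse

def parse_hdfs_url(url):
    """Split 'hdfs://host:port/user/<user>/...' into connection pieces."""
    parsed = urlparse(url)                  # scheme='hdfs', netloc='host:port'
    host, _, port = parsed.netloc.partition(':')
    parts = parsed.path.lstrip('/').split('/')
    user = parts[1] if len(parts) > 1 and parts[0] == 'user' else None
    return host, int(port), user, parsed.path

host, port, user, path = parse_hdfs_url(
    'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv')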
Example #17
File: core.py Project: quartox/knit
    def hdfs(self):
        """ An instance of HDFileSystem
        
        Useful for checking on the contents of the staging directory.
        Will be automatically generated using this instance's configuration,
        but can instead directly set ``self._hdfs`` if necessary.

        Note: if the namenode/port is not defined in the conf, will not attempt
        a connection, since it can take a while trying to connect to
        localhost:8020.
        """
        if self._hdfs is None:
            try:
                import hdfs3
                par2 = self.conf.copy()
                par2['host'] = par2.pop('nn')
                par2['port'] = par2.pop('nn_port')
                del par2['replication_factor']
                del par2['rm_port']
                del par2['rm_port_https']
                self._hdfs = hdfs3.HDFileSystem(pars=par2)
            except:
                self._hdfs = False
        return self._hdfs
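A hedged sketch of how this lazy property might be used from a Knit instance; hdfs_home and the property come from the code above, the rest is illustrative.

from knit import Knit

k = Knit()
fs = k.hdfs          # connects on first access, or is False if the conf has
                     # no namenode/port or the connection attempt failed
if fs:
    print(fs.ls(k.hdfs_home))   # inspect the staging directory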
Example #18
# Documentation for accessing HDFS from Python 3
# https://readthedocs.org/projects/hdfs3/downloads/pdf/latest/
import hdfs3
from collections import defaultdict, Counter

# Connect to HDFS.
# Check the docker-compose.yml configuration: this is the RPC port of the
# Hadoop namenode, 8020 by default in version 2.7, but it may be different.
hdfs = hdfs3.HDFileSystem('localhost', port=8020)
"""
HDFileSystem([host, port, connect, . . . ]) Connection to an HDFS namenode
HDFileSystem.cat(path) Return contents of file
HDFileSystem.chmod(path, mode) Change access control of given path
HDFileSystem.chown(path, owner, group) Change owner/group
HDFileSystem.df() Used/free disc space on the HDFS system
HDFileSystem.du(path[, total, deep]) Returns file sizes on a path.
HDFileSystem.exists(path) Is there an entry at path?
HDFileSystem.get(hdfs_path, local_path[, . . . ]) Copy HDFS file to local
HDFileSystem.getmerge(path, filename[, . . . ]) Concat all files in path (a directory) to local output file
HDFileSystem.get_block_locations(path[,. . . ]) Fetch physical locations of blocks
HDFileSystem.glob(path) Get list of paths mathing glob-like pattern (i.e., with “*”s).
HDFileSystem.info(path) File information (as a dict)
HDFileSystem.ls(path[, detail]) List files at path
HDFileSystem.mkdir(path) Make directory at path
HDFileSystem.mv(path1, path2) Move file at path1 to path2
HDFileSystem.open(path[, mode, replication, . . . ])
Open a file for reading or writing
HDFileSystem.put(filename, path[, chunk, . . . ]) Copy local file to path in HDFS
HDFileSystem.read_block(fn, offset, length) Read a block of bytes from an HDFS file
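A short sketch exercising a few of the calls listed above; the connection mirrors the one opened earlier and the paths are placeholders.

import hdfs3

hdfs = hdfs3.HDFileSystem('localhost', port=8020)

tmp_dir = '/tmp/hdfs3-demo'
if not hdfs.exists(tmp_dir):
    hdfs.mkdir(tmp_dir)

with hdfs.open(tmp_dir + '/greeting.txt', 'wb') as f:
    f.write(b'hello from hdfs3')

print(hdfs.ls(tmp_dir))                     # list files at path
print(hdfs.cat(tmp_dir + '/greeting.txt'))  # return contents of file
hdfs.rm(tmp_dir, recursive=True)            # clean up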
Example #19
import gc
import random
import time
import pyarrow as pa
import hdfs3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

DATA_SIZE = 200 * (1 << 20)
data = b'a' * DATA_SIZE

hdfs = pa.HdfsClient('localhost', 20500, 'wesm')
hdfscpp = pa.HdfsClient('localhost', 20500, 'wesm', driver='libhdfs3')
hdfs3_fs = hdfs3.HDFileSystem('localhost', port=20500, user='******')

path = '/tmp/test-data-file-1'
if hdfs.exists(path):
    hdfs.delete(path)
with hdfs.open(path, 'wb') as f:
    f.write(data)

def read_chunk(f, size):
    # do a random seek
    f.seek(random.randint(0, size))
    return f.read(size)

def ensemble_average(runner, niter=10):
    start = time.perf_counter()
    gc.disable()
    data_chunks = []
Example #20
    def __init__(self, **kwargs):
        self.fs = hdfs3.HDFileSystem(**kwargs)
def test_promotion(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])

    hdfs.rm(settings['HDFS_PATH'])
    hdfs.mkdir(settings['HDFS_PATH'])

    result = segment_manager_server.get('/promote')
    assert result.status == '405 METHOD NOT ALLOWED'

    # provision a test segment for write
    result = segment_manager_server.post('/provision',
                                         content_type='application/json',
                                         data=ujson.dumps(
                                             {'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict['write_url'].endswith(':6222/?segment=test_promotion')
    write_url = result_dict['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment promotion");\n')
    response = requests.post(write_url, sql)
    assert response.status_code == 200

    # shouldn't be anything in hdfs yet...
    expected_remote_path = os.path.join(settings['HDFS_PATH'], 'test_promot',
                                        'test_promotion.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # now write to the segment and promote it to HDFS
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post('/promote',
                                         content_type='application/json',
                                         data=ujson.dumps(
                                             {'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # make sure it doesn't think the segment is under promotion
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    query = rethinker.table('lock').get('write:lock:test_promotion')
    result = query.run()
    assert not result.get('under_promotion')

    # let's see if it's in hdfs
    listing_after_promotion = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing_after_promotion) == 1
    assert listing_after_promotion[0]['last_mod'] > before

    # grab the file from hdfs and check the content
    # n.b. copy created by sqlitebck may have different size, sha1 etc from orig
    size = None
    with tempfile.TemporaryDirectory() as tmpdir:
        local_copy = os.path.join(tmpdir, 'test_promotion.sqlite')
        hdfs.get(expected_remote_path, local_copy)
        conn = sqlite3.connect(local_copy)
        cur = conn.execute('select * from foo')
        assert cur.fetchall() == [('testing segment promotion', )]
        conn.close()
        size = os.path.getsize(local_copy)

    # test promotion when there is an assignment in rethinkdb
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': size,
        'hash_ring': 0,
        'id': 'localhost:test_promotion',
        'node': 'localhost',
        'remote_path': expected_remote_path,
        'segment': 'test_promotion'
    }).run()

    # promote it to HDFS
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post('/promote',
                                         content_type='application/json',
                                         data=ujson.dumps(
                                             {'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # make sure it doesn't think the segment is under promotion
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    query = rethinker.table('lock').get('write:lock:test_promotion')
    result = query.run()
    assert not result.get('under_promotion')

    # let's see if it's in hdfs
    listing_after_promotion = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing_after_promotion) == 1
    assert listing_after_promotion[0]['last_mod'] > before

    # pretend the segment is under promotion
    rethinker.table('lock')\
            .get('write:lock:test_promotion')\
            .update({'under_promotion': True}).run()
    assert rethinker.table('lock')\
            .get('write:lock:test_promotion').run()\
            .get('under_promotion')
    with pytest.raises(Exception):
        result = segment_manager_server.post(
            '/promote',
            content_type='application/json',
            data=ujson.dumps({'segment': 'test_promotion'}))
Example #22
File: aac.py Project: NielsDegrande/o3
def ingest_avro(schema_path: str, avro_path: str, target_table: str, host: str,
                thrift_port: int, hdfs_port: int, username: str):
    """Ingest Avro data into Hive."""

    fs = hdfs3.HDFileSystem(host=host, port=hdfs_port, user=username)

    schema_basename = os.path.basename(schema_path)
    hdfs_schema_path = os.path.join(f'/user/{username}', schema_basename)
    full_hdfs_schema_path = f'hdfs://{host}:{hdfs_port}{hdfs_schema_path}'
    if fs.exists(hdfs_schema_path):
        fs.rm(hdfs_schema_path)
    fs.put(schema_path, hdfs_schema_path, replication=1)

    avro_basename = os.path.basename(avro_path)
    hdfs_avro_path = os.path.join(f'/user/{username}', avro_basename)

    if not fs.exists(hdfs_avro_path):
        fs.put(avro_path, hdfs_avro_path, replication=1)

    conn = hive.Connection(host=host, port=thrift_port, username=username,
                           configuration={
                               'hive.exec.dynamic.partition.mode': 'nonstrict'
                           })
    cursor = conn.cursor()

    input_fmt = 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
    output_fmt = 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
    row_format = 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
    temp_table_name = avro_basename.replace('.', '_').replace('-', '_')

    create_temp_table_stmt = f"""
        CREATE TABLE IF NOT EXISTS {temp_table_name}
        ROW FORMAT SERDE '{row_format}'
        STORED AS INPUTFORMAT '{input_fmt}'
        OUTPUTFORMAT '{output_fmt}'
        TBLPROPERTIES ('avro.schema.url'='{full_hdfs_schema_path}')
    """
    print(f'--- create_temp_table_stmt ---\n{create_temp_table_stmt}')
    cursor.execute(create_temp_table_stmt)

    select_temp_row_stmt = f"""
        SELECT * FROM {temp_table_name} LIMIT 1
    """
    print(f'--- select_temp_row_stmt---\n{select_temp_row_stmt}')
    cursor.execute(select_temp_row_stmt)

    if cursor.fetchone() is None:
        load_data_stmt = f"""
            LOAD DATA INPATH '{hdfs_avro_path}'
            INTO TABLE {temp_table_name}
        """
        print(f'--- load_data_stmt ---\n{load_data_stmt}')
        cursor.execute(load_data_stmt)

    create_target_table_stmt = f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS {target_table}
        PARTITIONED BY (ds STRING, h STRING, en STRING)
        ROW FORMAT SERDE '{row_format}'
        STORED AS INPUTFORMAT '{input_fmt}'
        OUTPUTFORMAT '{output_fmt}'
        TBLPROPERTIES ('avro.schema.url'='{full_hdfs_schema_path}')
    """
    print(f'--- create_target_table_stmt ---\n{create_target_table_stmt}')
    cursor.execute(create_target_table_stmt)

    insert_data_stmt = f"""
        INSERT INTO {target_table} PARTITION (ds, h, en)
        SELECT
            *,
            datestamp AS ds,
            substr(server_date, 12, 2) AS h,
            event_name AS en
        FROM
            {temp_table_name}
    """

    print(f'--- insert_data_stmt ---\n{insert_data_stmt}')
    cursor.execute(insert_data_stmt)

    drop_temp_table_stmt = f"""
        DROP TABLE {temp_table_name}
    """

    print(f'--- drop_temp_table_stmt ---\n{drop_temp_table_stmt}')
    cursor.execute(drop_temp_table_stmt)
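A hedged example call of ingest_avro(); only the parameter names come from the signature above, every value is a placeholder.

ingest_avro(
    schema_path='events.avsc',           # local Avro schema file
    avro_path='events-2024-01-01.avro',  # local Avro data file
    target_table='events',               # Hive table to land the data in
    host='hadoop-master',                # namenode / HiveServer2 host
    thrift_port=10000,                   # HiveServer2 thrift port
    hdfs_port=8020,                      # HDFS namenode RPC port
    username='hive',
)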
def test_delete_segment(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')

    # initially, segment doesn't exist
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 404

    # provision segment
    result = segment_manager_server.post(
        '/provision',
        content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict['write_url'].endswith(
        ':6222/?segment=test_delete_segment')
    write_url = result_dict['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment deletion");\n')
    response = requests.post(write_url, sql)
    assert response.status_code == 200

    # check that local file exists
    local_path = os.path.join(settings['LOCAL_DATA'],
                              'test_delete_segment.sqlite')
    assert os.path.exists(local_path)

    # check that attempted delete while under write returns 400
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 400

    # shouldn't be anything in hdfs yet
    expected_remote_path = os.path.join(settings['HDFS_PATH'],
                                        'test_delete_segm',
                                        'test_delete_segment.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # promote segment to hdfs
    result = segment_manager_server.post(
        '/promote',
        content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # let's see if it's in hdfs
    hdfs_ls = hdfs.ls(expected_remote_path, detail=True)
    assert len(hdfs_ls) == 1

    # add an assignment (so we can check it is deleted successfully)
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': os.path.getsize(local_path),
        'hash_ring': 0,
        'id': '%s:test_delete_segment' % socket.gethostname(),
        'node': socket.gethostname(),
        'remote_path': expected_remote_path,
        'segment': 'test_delete_segment'
    }).run()

    # check that service entries, assignment exist
    assert rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert rethinker.table('assignment')\
            .get('%s:test_delete_segment' % socket.gethostname()).run()

    # check that attempted delete while under write returns 400
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 400

    # delete the write lock
    assert rethinker.table('lock')\
            .get('write:lock:test_delete_segment').delete().run() == {
                    'deleted': 1, 'errors': 0, 'inserted': 0,
                    'replaced': 0, 'skipped': 0, 'unchanged': 0}

    # delete the segment
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 204

    # check that service entries and assignment are gone
    assert not rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert not rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert not rethinker.table('assignment')\
            .get('%s:test_delete_segment' % socket.gethostname()).run()

    # check that local file is gone
    assert not os.path.exists(local_path)

    # check that file is gone from hdfs
    with pytest.raises(FileNotFoundError):
        hdfs_ls = hdfs.ls(expected_remote_path, detail=True)
def train_or_predict():
    # FLAGS.input  = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv'
    # FLAGS.output = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/output/models/rnn/0020'
    # FLAGS.model  = '/home/csle/testCodes/models/rnn/0020'

    # FLAGS.input  = 'file:///home/csle/testCodes/input/trainset.csv'
    # FLAGS.output = 'file:///home/csle/testCodes/models/rnn/0020'
    # FLAGS.model  = 'file:///home/csle/testCodes/models/rnn/0020'

    # FLAGS.input  = '/home/csle/testCodes/input/trainset.csv'
    # FLAGS.output = '/home/csle/testCodes/models/rnn/0020'
    # FLAGS.model  = '/home/csle/testCodes/models/rnn/0020'

    (root_path, sep, input_data_path) = FLAGS.input.rpartition('/')
    (checkpoint_dir, sep, model_path) = FLAGS.model.rpartition('/')
    (train_accuracy_dir, sep,
     train_accuracy_path) = FLAGS.output.rpartition('/')

    if FLAGS.isTrain:
        datasets, num_links = preproc.load_processed_data(
            FLAGS.isTrain, FLAGS.num_train, FLAGS.num_validation,
            FLAGS.num_test, FLAGS.input, FLAGS.num_steps, FLAGS.elapse_steps)
    else:
        datasets, num_links = preproc.load_processed_data(
            FLAGS.isTrain, FLAGS.num_predict, 0, 0, FLAGS.input,
            FLAGS.num_steps, FLAGS.elapse_steps)
    if (FLAGS.num_links > num_links):
        FLAGS.num_links = num_links
    if (FLAGS.num_outputs > FLAGS.num_links):
        FLAGS.num_outputs = FLAGS.num_links

    if 'sess' in globals():
        sess.close()

    tf.reset_default_graph()
    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32,
                       [FLAGS.batch_size, FLAGS.num_steps, FLAGS.num_links],
                       name='input_placeholder')
    y = tf.placeholder(tf.float32, [FLAGS.batch_size, FLAGS.num_outputs],
                       name='labels_placeholder')
    loss_weights = tf.placeholder(tf.float32, [FLAGS.batch_size])
    keep_prob = tf.placeholder(tf.float32)

    cell = tf.contrib.rnn.BasicLSTMCell(FLAGS.state_size,
                                        state_is_tuple=True,
                                        reuse=tf.get_variable_scope().reuse)

    init_state = cell.zero_state(FLAGS.batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell,
                                                 x,
                                                 initial_state=init_state)
    rnn_last_outputs = rnn_outputs[:, FLAGS.num_steps - 1, :]

    # Output Layers
    with tf.variable_scope('fully_connected_0'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.state_size])
        b = tf.get_variable('b', [FLAGS.state_size],
                            initializer=tf.constant_initializer(0.0))

        fc_outputs_0 = tf.nn.elu(tf.matmul(rnn_last_outputs, W) + b)
        fc_dropout_0 = tf.nn.dropout(fc_outputs_0, keep_prob)

    with tf.variable_scope('fully_connected_1'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.state_size])
        b = tf.get_variable('b', [FLAGS.state_size],
                            initializer=tf.constant_initializer(0.0))

        fc_outputs_1 = tf.nn.elu(tf.matmul(fc_dropout_0, W) + b)
        fc_dropout_1 = tf.nn.dropout(fc_outputs_1, keep_prob)

    with tf.variable_scope('fully_connected_2'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.num_outputs])
        b = tf.get_variable('b', [FLAGS.num_outputs],
                            initializer=tf.constant_initializer(0.0))

        # final outputs, predictions
        outputs = tf.nn.elu(tf.matmul(fc_dropout_1, W) + b)

    unscaled_output = 100 * (outputs + 0.5)

    mse = tf.square(y - outputs)
    mse = tf.reduce_mean(mse, reduction_indices=[1])
    total_loss = tf.reduce_sum(mse * loss_weights)

    train_step = tf.train.AdamOptimizer(
        FLAGS.learning_rate).minimize(total_loss)

    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())

    if FLAGS.isTrain:
        print('Tensorflow train job started !')
        num_examples = FLAGS.num_train - FLAGS.num_steps - FLAGS.elapse_steps + 1
        pbar = PrgBar(FLAGS.num_epoch, num_examples)
        p = os.path.join(FLAGS.model, FLAGS.model_filename)
        local_model_path = p[:p.rindex(os.path.sep)]
        local_path_list = local_model_path.split(os.path.sep)
        if (local_path_list[0].lower() == 'file:'):
            local_model_path = '/' + os.path.join(*local_path_list[3:])
        os.makedirs(local_model_path, exist_ok=True)
        for idx in range(FLAGS.num_epoch):
            last_batch = False
            while (last_batch is False):
                X, Y, last_batch, lw, num_samples = datasets.train.next_batch(
                    FLAGS.batch_size, FLAGS.num_links, FLAGS.num_outputs)
                loss_, last_outputs, est, _ = \
                    sess.run([total_loss, rnn_last_outputs, unscaled_output, train_step], \
                             feed_dict={x: X, y: Y, loss_weights: lw, keep_prob: FLAGS.dropout})
                pbar.log(num_samples, loss_)
                sys.stdout.flush()

            if (idx + 1) % FLAGS.checkpoint_steps == 0:
                saver.save(sess, p)
        print(pbar.losses)
        if (idx + 1) % FLAGS.checkpoint_steps != 0:
            saver.save(sess, p)

        model_path = FLAGS.model
        path_list = model_path.split(os.path.sep)
        if (path_list[0].lower() == 'file:'):
            model_path = '/' + os.path.join(*path_list[3:])

        filenames = glob.glob(model_path +
                              '/*')  # /home/csle/testCodes/models/rnn/0019/*

        output_file_path = FLAGS.output
        path_list = output_file_path.split(os.path.sep)
        if (path_list[0].lower() == 'hdfs:'):
            # 'hdfs://csle1:9000/user/leeyh_etri_re_kr/output/models/rnn/0019'
            master, port = path_list[2].split(':')
            hdfs = hdfs3.HDFileSystem(master, port=int(port), user='******')
            output_path = '/' + os.path.join(*path_list[3:])
            if (hdfs.exists(output_path)):
                hdfs.rm(output_path)
            for file in filenames:
                hdfs.mkdir(output_path)
                path, filename = os.path.split(file)
                hdfs.put(file,
                         output_path + '/' + filename,
                         block_size=1048576)
                print(hdfs.ls(output_path))

        p = os.path.join(train_accuracy_dir, train_accuracy_path)
        local_p = os.path.join(local_model_path, train_accuracy_path)
        accuracy_file = open(local_p, 'w')
        accuracy_file.write("%.7f" % pbar.getAverageLoss())
        accuracy_file.close()
        hdfs.put(local_p,
                 output_path + '/' + train_accuracy_path,
                 block_size=1048576)

    else:
        # Here's where you're restoring the variables w and b.
        # Note that the graph is exactly as it was when the variables were
        # saved in a prior training run.
        print('Tensorflow prediction job started !')
        ckpt = tf.train.get_checkpoint_state(FLAGS.model)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Restored!', end="\r")
            predictions = evaluate_network(sess, datasets.train, x, y,
                                           total_loss, unscaled_output,
                                           loss_weights, keep_prob)

            #             if not os.path.exists(os.path.join(FLAGS.output_predict_dir, FLAGS.output_predict_path)):
            p = os.path.join(FLAGS.output_predict_dir,
                             FLAGS.output_predict_path)
            os.makedirs(p[:p.rindex(os.path.sep)], exist_ok=True)
            predict_file = open(p, 'w')
            for predicts in predictions:
                predicts[predicts < 0] = 0.0
                strings = ["%.2f" % predict for predict in predicts]
                predict_file.write(",".join(strings))
                predict_file.write("\n")
                print(",".join(strings), end="\r")
                sys.stdout.flush()
            predict_file.close()
        else:
            print('No checkpoint found!')