Example #1
 def tearDown(self):
   fs = hdfs.hdfs("", 0)
   fs.delete(self.local_wd)
   fs.close()
   fs = hdfs.hdfs("default", 0)
   fs.delete(self.hdfs_wd)
   fs.close()
Example #2
def write(writeFlag):
    if (writeFlag == True):
        # instantiate hadoop
        hdfs.hdfs()
        
        targetPath = config.targetPath
        targetDirectory = config.targetDirectory
        sourceFile = config.sourceFile

        print("Target Path: " + targetPath)
        print("Target Directory: " + targetDirectory)
        print("Source Path: " + sourceFile)

        dumpFile = open(sourceFile, "r")
        fullText = dumpFile.read()
        dumpFile.close()
        
        # write to hadoop
        #hdfs.mkdir(targetDirectory)
        hdfs.dump(fullText, targetPath)
#hdfs.cp(sourceFile, targetPath)

#print (hdfs.ls("test4"))
#files = hdfs.ls("test4")

# read from hadoop
#hdfs.get("test4/hello.txt", "/tmp/hello.txt")
#with open("/tmp/hello.txt") as f:
#	print f.read()

#print(hdfs.ls("test", "hduser1"))
#text = hdfs.load("test/hello.txt")
#print text
Example #3
 def tearDown(self):
     fs = hdfs.hdfs("", 0)
     fs.delete(self.local_wd)
     fs.close()
     fs = hdfs.hdfs("default", 0)
     fs.delete(self.hdfs_wd)
     fs.close()
Example #4
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
Example #5
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
Example #6
 def cache(self):
     orig_fs = hdfs.hdfs(*self.hp_cases[0])
     for host, port in self.hp_cases[1:]:
         fs = hdfs.hdfs(host, port)
         self.assertTrue(fs.fs is orig_fs.fs)
         fs.close()
         self.assertFalse(orig_fs.closed)
     orig_fs.close()
     self.assertTrue(orig_fs.closed)
Example #7
 def cache(self):
   hdfs.hdfs._CACHE.clear()
   orig_fs = hdfs.hdfs(*self.hp_cases[0])
   for host, port in self.hp_cases[1:]:
     fs = hdfs.hdfs(host, port)
     self.assertTrue(fs.fs is orig_fs.fs)
     fs.close()
     self.assertFalse(orig_fs.closed)
   orig_fs.close()
   self.assertTrue(orig_fs.closed)
Example #8
def _hdfs_filesystem():
    """Retrieve references to the local and HDFS file system.

    Need to be able to specify host/port. For now, works off defaults.
    """
    fs = hdfs("default", 0)
    lfs = hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()
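
Note: a generator like the one above only works as a `with` target if it is wrapped as a context manager, so the original module presumably decorates it with `contextlib.contextmanager`. A minimal sketch of that pattern under that assumption (the helper name below is hypothetical):

from contextlib import contextmanager

import pydoop.hdfs as hdfs


@contextmanager
def hdfs_filesystems():
    # "default"/0 resolves the namenode from the Hadoop configuration;
    # ""/0 returns a handle on the local file system.
    fs = hdfs.hdfs("default", 0)
    lfs = hdfs.hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()

# Usage: both handles are closed automatically when the block exits.
# with hdfs_filesystems() as (fs, lfs):
#     lfs.copy("local_data.txt", fs, "remote_data.txt")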
Example #9
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Example #10
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Example #11
def copyFileToHDFSFolder(localpath, hdfspath):
    """
    Copies a file from a local or HDFS path to an HDFS location
    :param localpath: path to local file
    :param hdfspath: path to target file on HDFS
    :return: None
    """
    if localpath.startswith('file:/'):
        lf = H.hdfs("", 0)
    else:
        lf = H.hdfs()
    h = H.hdfs()
    lf.copy(localpath, h, hdfspath)
Example #12
def read(readFlag):
    print(readFlag)
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Example #13
def save_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    saver = tf.train.Saver()
    wd = tempfile.mkdtemp(prefix="pydeep_")
    sub_d = hdfs.path.splitext(hdfs.path.basename(path))[0]
    abs_d = os.path.join(wd, sub_d)
    os.makedirs(abs_d)
    saver.save(session, os.path.join(abs_d, Model.CHECKPOINT_NAME))
    zip_fn = "%s.zip" % abs_d
    shutil.make_archive(*zip_fn.rsplit(".", 1), root_dir=abs_d)
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        local_fs.copy(zip_fn, fs, path)
Example #14
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #15
File: seqal_run.py Project: pinno/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug("Properties:\n%s", "\n".join( sorted([ "%s = %s" % (str(k), str(v)) for k,v in self.properties.iteritems() ]) ))
        self.logger.info("Input: %s; Output: %s; reference: %s", self.options.input, self.options.output, self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.', suffix=str(random.random()), dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(self.remote_bin_name)['name']

                return seal_utilities.run_pipes(full_name, self.options.input, self.options.output,
                    properties=self.properties, args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(self.remote_bin_name) # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted", self.remote_bin_name)
                except:
                    self.logger.error("Error deleting the temporary pipes script %s from HDFS", self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Example #16
 def connect(self):
   for host, port in self.hp_cases:
     for user in self.u_cases:
       expected_user = user or CURRENT_USER
       fs = hdfs.hdfs(host, port, user=user)
       self.assertEqual(fs.user, expected_user)
       fs.close()
Example #17
    def build_map(self, top_dir):
        """\
        For each subdir (corresponding to an image class), build the full
        list of (filename, offset) pair where each bottleneck dump can be
        retrieved.

        {'dandelion': [
            ('part-m-00000', 0),
            ('part-m-00000', 8192),
            ...
            ('part-m-00003', 163840)
        ],
        'roses': [
            ('part-m-00000', 0),
            ...
        ]}
        """
        m = {}
        basename = hdfs.path.basename
        with hdfs.hdfs() as fs:
            for stat in fs.list_directory(top_dir):
                if stat['kind'] != 'directory':
                    continue
                subd = stat['name']
                positions = []
                for s in fs.list_directory(subd):
                    bname = basename(s["name"])
                    if bname.startswith("_"):
                        continue
                    assert s["size"] % self.record_size == 0
                    for i in range(0, s["size"], self.record_size):
                        positions.append((bname, i))
                m[basename(subd)] = positions
        return m
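
A hedged sketch of how the (filename, offset) map built above might be consumed: rejoin the class subdirectory and part file under `top_dir`, seek to the recorded offset and read one fixed-size record. The helper name is hypothetical and `record_size` is assumed to match the writer's record size; `hdfs.open`, `hdfs.path.join`, `seek` and `read` are standard pydoop calls.

import pydoop.hdfs as hdfs


def read_bottleneck(top_dir, bneck_map, class_name, index, record_size):
    # bneck_map is the dict returned by build_map(top_dir) above.
    fname, offset = bneck_map[class_name][index]
    with hdfs.open(hdfs.path.join(top_dir, class_name, fname), "rb") as f:
        f.seek(offset)
        return f.read(record_size)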
Example #18
 def __create_data_file(self):
     host, port, path = split_hdfs_path(self.data_file_name)
     fs = hdfs(host, port)
     f = fs.open_file(path, os.O_WRONLY, 0, 0, 0)
     f.write(self.f.getvalue())
     f.close()
     fs.close()
Example #19
class StorageHandler:

    hdfsobj = hdfs.hdfs()

    def __init__(self, host, port):
        self.hdfsobj = hdfs.hdfs(host,
                                 port,
                                 user="******",
                                 groups=["vagrant"])

    def pwd(self):
        return self.hdfsobj.working_directory()

    def listDirectory(self, path="/"):
        return self.hdfsobj.list_directory(path)

    def delete(self, path):
        self.hdfsobj.delete(path, False)

    def put(self, source, destination):
        hdfs.put(source, destination)

    def copyFile(self, source, destination):
        self.hdfsobj.copy(source, self.hdfsobj, destination)

    def write(self, path, mod, data):
        with hdfs.open(path, mod) as f:
            f.write(data)
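
A brief usage sketch for the wrapper class above; the host, port and paths are placeholders rather than values from the original project.

handler = StorageHandler("localhost", 9000)
print(handler.pwd())
handler.put("local_report.csv", "/tmp/report.csv")
for entry in handler.listDirectory("/tmp"):
    print(entry["name"], entry["kind"])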
Example #20
File: db.py Project: AshinGau/eventdb
 def __init__(self, tableName, host='192.168.60.64', infoTable='runInfo'):
     self.tableName = tableName
     self.conn = happybase.Connection(host)
     self.table = self.conn.table(infoTable)
     self.eventdb = self.conn.table('HEP:' + tableName)
     self.escape = escape()
     self.fs = hdfs.hdfs(host=host, port=8022, user='******')
Example #21
 def connect(self):
     for host, port in self.hp_cases:
         for user in self.u_cases:
             expected_user = user or CURRENT_USER
             fs = hdfs.hdfs(host, port, user=user)
             self.assertEqual(fs.user, expected_user)
             fs.close()
Example #22
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Example #23
  def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
    """
    Check directories above the remote module and issue a warning if
    they are not traversable by all users.

    The reasoning behind this is mainly aimed at set-ups with a centralized
    Hadoop cluster, accessed by all users, and where the Hadoop task tracker
    user is not a superuser; an example may be if you're running a shared
    Hadoop without HDFS (using only a POSIX shared file system).  The task
    tracker correctly changes user to the job requester's user for most
    operations, but not when initializing the distributed cache, so jobs who
    want to place files not accessible by the Hadoop user into dist cache fail.
    """
    host, port, path = hdfs.path.split(abs_remote_path)
    if host == '' and port == 0: # local file system
      host_port = "file:///"
    else:
      # FIXME: this won't work with any scheme other than hdfs:// (e.g., s3)
      host_port = "hdfs://%s:%s/" % (host, port)
    path_pieces = path.strip('/').split(os.path.sep)
    fs = hdfs.hdfs(host, port)
    for i in xrange(0, len(path_pieces)):
      part = os.path.join(host_port, os.path.sep.join(path_pieces[0:i+1]))
      permissions = fs.get_path_info(part)['permissions']
      if permissions & 0111 != 0111:
        self.logger.warning(
          "the remote module %s may not be readable\n" +
          "by the task tracker when initializing the distributed cache.\n" +
          "Permissions on path %s: %s", abs_remote_path, part, oct(permissions))
        break
Example #24
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Example #25
def get():
    """ Get a handle to pydoop hdfs using the default namenode (specified in hadoop config)

    Returns:
        Pydoop hdfs handle
    """
    return hdfs.hdfs('default', 0, user=project_user())
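
A minimal usage sketch for the helper above, assuming a reachable default namenode; the directory path is a placeholder.

fs = get()
try:
    for entry in fs.list_directory("/tmp"):
        print(entry["name"], entry["size"])
finally:
    fs.close()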
Example #26
    def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
        """
        Check directories above the remote module and issue a warning if
        they are not traversable by all users.

        The reasoning behind this is mainly aimed at set-ups with a
        centralized Hadoop cluster, accessed by all users, and where
        the Hadoop task tracker user is not a superuser; an example
        may be if you're running a shared Hadoop without HDFS (using
        only a POSIX shared file system).  The task tracker correctly
        changes user to the job requester's user for most operations,
        but not when initializing the distributed cache, so jobs who
        want to place files not accessible by the Hadoop user into
        dist cache fail.
        """
        host, port, path = hdfs.path.split(abs_remote_path)
        if host == '' and port == 0:  # local file system
            host_port = "file:///"
        else:
            # FIXME: this won't work with any scheme other than
            # hdfs:// (e.g., s3)
            host_port = "hdfs://%s:%s/" % (host, port)
        path_pieces = path.strip('/').split(os.path.sep)
        fs = hdfs.hdfs(host, port)
        for i in range(0, len(path_pieces)):
            part = os.path.join(host_port,
                                os.path.sep.join(path_pieces[0:i + 1]))
            permissions = fs.get_path_info(part)['permissions']
            if permissions & 0o111 != 0o111:
                self.logger.warning(
                    ("remote module %s may not be readable by the task "
                     "tracker when initializing the distributed cache.  "
                     "Permissions on %s: %s"), abs_remote_path, part,
                    oct(permissions))
                break
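
The `permissions & 0o111` check above verifies that the execute (directory traverse) bit is set for owner, group and others. A standalone illustration with made-up permission values:

for perm in (0o755, 0o750, 0o700):
    traversable_by_all = (perm & 0o111) == 0o111
    print(oct(perm), "->", traversable_by_all)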
Example #27
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #28
def load_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, hdfs.path.basename(path))
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        fs.copy(path, local_fs, zip_fn)
    unpack_dir = os.path.splitext(zip_fn)[0]
    shutil.unpack_archive(zip_fn, unpack_dir)
    ckpt_path = os.path.join(unpack_dir, Model.CHECKPOINT_NAME)
    metagraph_path = "%s.meta" % ckpt_path
    if not os.path.isfile(metagraph_path):
        raise RuntimeError("checkpoint files not found in %s" % zip_fn)
    saver = tf.train.import_meta_graph(metagraph_path)
    saver.restore(session, ckpt_path)
Example #29
 def __init__(self, context):
     super(WholeFileReader, self).__init__(context)
     self.logger = LOGGER.getChild("WholeFileReader")
     raw_split = context.get_input_split(raw=True)
     self.isplit = OpaqueInputSplit().read(io.BytesIO(raw_split))
     self.paths = self.isplit.payload
     self.n_paths = len(self.paths)
     self.fs = hdfs.hdfs()
Example #30
def clearLocatie():
    """
    Removes the locatie parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(LOCATIE):
        h.delete(LOCATIE)
Example #31
def clearTelling():
    """
    Removes the telling parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(TELLING):
        h.delete(TELLING)
Example #32
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path']) as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Example #33
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path'], 'rt') as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Example #34
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta") if
                       self.isplit.offset == 0 else None)
Example #35
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Example #36
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Example #37
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(
         self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta")
                       if self.isplit.offset == 0 else None)
Example #38
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Example #39
def clearResults(name=""):
    """
    Clears target result parquet table name, or all result parquet tables if no name is given
    :param name: the target result parquet table name
    :return: None
    """
    p = path.join(RESULT_DIR, name)
    h = hdfs()
    if h.exists(p):
        h.delete(p)
Example #40
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Example #41
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"],
                             compression_type='gzip',
                             acks=1,
                             retries=2)

    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % (myfile))
            continue

        elif myfile["kind"] == "file":
            pass

        else:
            raise Exception, "Unknown kind %s for %s" % (myfile["kind"],
                                                         myfile["name"])

        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % (myfile))
            continue

        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % (myfile))
            continue

        logger.info("Working on %s" % (myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)

        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" % (myfile["name"]))

        #sleep some time
        time.sleep(1)

    # for all files in HDFS
    producer.close()
Example #42
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10,
                       10)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Example #43
File: treewalk.py Project: xuande/pydoop
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Example #44
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Example #45
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)

        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(),
                                 freq="B")
        all_days = [x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size > 3:
                dir_name = "/tmp/ticks/" + str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
Example #46
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0])
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(
         ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
     )
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Example #47
File: seqal_run.py Project: okulev/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug(
                "Properties:\n%s", "\n".join(
                    sorted([
                        "%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()
                    ])))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                         self.options.input, self.options.output,
                         self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.',
                                                   suffix=str(random.random()),
                                                   dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(
                    self.remote_bin_name)['name']

                return hadut.run_pipes(full_name,
                                       self.options.input,
                                       self.options.output,
                                       properties=self.properties,
                                       args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(
                        self.remote_bin_name
                    )  # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted",
                                      self.remote_bin_name)
                except:
                    self.logger.error(
                        "Error deleting the temporary pipes script %s from HDFS",
                        self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Example #48
def main(args):
    host, port, out_dir = hdfs.path.split(args.out_dir)
    fs = hdfs.hdfs(host, port)
    fs.create_directory(out_dir)
    join = os.path.join
    for dt, path in get_images(args.in_dir):
        out_path = join(out_dir, f"{dt.strftime(OUT_FMT)}.png")
        if not args.overwrite and fs.exists(out_path):
            continue
        with io.open(path, "rb") as fi:
            with fs.open_file(out_path, "wb") as fo:
                fo.write(fi.read())
Example #49
 def copy(self):
   local_fs = hdfs.hdfs('', 0)
   local_wd = make_wd(local_fs)
   from_path = os.path.join(local_wd, uuid.uuid4().hex)
   content = uuid.uuid4().hex
   with open(from_path, "w") as f:
     f.write(content)
   to_path = self._make_random_file()
   local_fs.copy(from_path, self.fs, to_path)
   local_fs.close()
   with self.fs.open_file(to_path) as f:
     self.assertEqual(f.read(), content)
   shutil.rmtree(local_wd)
Example #50
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)
    
    # a global variable
    global producer 

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"], compression_type='gzip', acks=1, retries=2)
    
    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" %(myfile))
            continue
        
        elif myfile["kind"] == "file":
            pass
        
        else:
            raise Exception, "Unknown kind %s for %s" %(myfile["kind"], myfile["name"])
            
        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        logger.info("Working on %s" %(myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)
            
        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" %(myfile["name"]))
        
        #sleep some time
        time.sleep(1)
                    
    # for all files in HDFS
    producer.close()
Example #51
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">"*15+code+">"*15)

        all_days=pd.date_range(start=str(time_to_market),end=dt.date.today(),freq="B")
        all_days=[x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving "+code+"@"+str(day)+"...")
            while True:
                try:
                    df=ts.get_tick_data(code,date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size >3:
                dir_name="/tmp/ticks/"+str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name=dir_name+"/"+str(day)+".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s=hdfs.hdfs(host="spark-1",port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name,"./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<"*15+code+"<"*15)
    return (socket.gethostname(),code)
Example #52
def main(argv=sys.argv[1:]):
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--conf-dir", metavar="HADOOP_CONF_DIR")
  args = parser.parse_args(argv)
  if args.conf_dir:
    os.environ["HADOOP_CONF_DIR"] = os.path.abspath(args.conf_dir)
    hdfs.reset()
  fs = hdfs.hdfs()
  print "--- OPEN ---"
  dump_status(fs)
  print "cwd:", fs.working_directory()
  print
  fs.close()
  print "--- CLOSED ---"
  dump_status(fs)
Example #53
 def setUp(self):
   wd = tempfile.mkdtemp()
   wd_bn = os.path.basename(wd)
   self.local_wd = "file:%s" % wd
   fs = hdfs.hdfs("default", 0)
   fs.create_directory(wd_bn)
   self.hdfs_wd = fs.get_path_info(wd_bn)["name"]
   fs.close()
   basenames = ["test_path_%d" % i for i in xrange(2)]
   self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames]
   self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames]
   self.data = make_random_data(4*BUFSIZE + BUFSIZE/2)
   for path in self.local_paths:
     self.assertTrue(path.startswith("file:"))
   for path in self.hdfs_paths:
     if not hdfs.default_is_local():
       self.assertTrue(path.startswith("hdfs:"))
Example #54
 def stat(self):
     if hdfs.default_is_local():
         return
     bn = '%s%s' % (make_random_str(), UNI_CHR)
     fn = '/user/%s/%s' % (DEFAULT_USER, bn)
     fs = hdfs.hdfs("default", 0)
     p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
     with fs.open_file(fn, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(fn)
     fs.close()
     s = hdfs.path.stat(p)
     for n1, n2 in self.NMAP.iteritems():
         attr = getattr(s, n1, None)
         self.assertFalse(attr is None)
         self.assertEqual(attr, info[n2])
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(p)
Example #55
def _clean_up_bcl_output(output_dir):
    """
    Delete prq files with no data
    """
    host, port, _ = phdfs.path.split(output_dir)
    fs = phdfs.hdfs(host, port)
    count = 0
    for item in fs.walk(output_dir):
        if item['kind'] == 'file' and item['name'].endswith('.gz') and item['size'] < 30:
            if not item['name'].startswith('hdfs://'):
                raise RuntimeError("Insanity!  Trying to delete %s!" % item['name'])
            fs.delete(item['name'], recursive=False)
            count += 1
    logger.info("Removed %d empty files from bcl output", count)

    undet_path = os.path.join(output_dir, 'Undetermined')
    if phdfs.path.exists(undet_path):
        logger.info("Removing reads from Undetermined dataset %s", undet_path)
        fs.delete(undet_path)
Example #56
def main(argv):
  
  try:
    depth = int(argv[1])
    span = int(argv[2])
  except IndexError:
    print "Usage: python %s DEPTH SPAN" % argv[0]
    sys.exit(2)

  fs = hdfs.hdfs()
  try:
    root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
    try:
      fs.delete(root)
    except IOError:
      pass
    fs.create_directory(root)
    treegen(fs, root, depth, span)
  finally:
    fs.close()
Example #57
    def rename_compressed_files(self, file_table):
        # find the extension
        output_files = hdfs.ls(self.output_path)
        if len(output_files) == 0:
            return

        compressor_extension = self.get_compressor_extension(output_files)
        self.log.debug("compressor extension is %s", compressor_extension)

        hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
        if hdfs_host == '':
            is_local_fs = True
        else:
            is_local_fs = False
            output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)

        file_table.seek(0)
        for mapid, line in enumerate(file_table.xreadlines()):
            _, _, relative_output_name = line.rstrip('\n').split('\t')
            # we expect the map task ids to be assigned in the same order as the input
            # file list, so we can match the input file to an output file by its position
            # in the input file list.
            hadoop_output = os.path.join(self.output_path, "part-%05d" % mapid) + compressor_extension
            desired_file_name = os.path.join(self.output_path, relative_output_name) + compressor_extension
            if hadoop_output != desired_file_name:
                self.log.debug("renaming %s to %s", hadoop_output, desired_file_name)
                if is_local_fs:
                    # Though we could transparently use hdfs.move for both local fs and hdfs,
                    # using native methods for the local fs should be faster.
                    # os.renames automatically creates necessary parent directories for destination.
                    os.renames(urlparse(hadoop_output).path, urlparse(desired_file_name).path)
                else:
                    # create the output subdirectory, if necessary
                    dirname = os.path.dirname(relative_output_name)
                    if dirname:
                        output_hdfs.create_directory( os.path.join(self.output_path, dirname) )
                    if output_hdfs.exists(desired_file_name):
                        raise RuntimeError("Can't overwrite file in output directory: %s" % desired_file_name)
                    output_hdfs.move(hadoop_output, output_hdfs, desired_file_name)
Example #58
 def setUp(self):
     self.fs = hdfs()
     self.wd = utils.make_wd(self.fs)
Example #59
File: mr_blast.py Project: crs4/vispa
def main(argv):

  parser = make_parser()
  opt, args = parser.parse_args()
  try:
    input_fasta = args[0]
    db_archive = args[1]
  except IndexError:
    parser.print_help()
    sys.exit(2)

  STR_GENERATOR.prefix = os.path.basename(input_fasta)

  logger = logging.getLogger()
  for h in logger.handlers:
    logger.removeHandler(h)
  opt.log_level_str = opt.log_level
  opt.log_level = getattr(logging, opt.log_level)
  kwargs = {'format': LOG_FORMAT,
            'datefmt': LOG_DATEFMT,
            'level': opt.log_level}
  if opt.log_file:
    kwargs['filename'] = opt.log_file
  logging.basicConfig(**kwargs)

  logger.debug("cli args: %r" % (args,))
  logger.debug("cli opts: %s" % opt)

  if opt.mr_dump_file:
    opt.mr_dump_file = open(opt.mr_dump_file, "w")
  else:
    opt.mr_dump_file = sys.stderr
  
  if not opt.blast_db:
    opt.blast_db = os.path.basename(db_archive).split(".", 1)[0]
    logger.info("--blast-db not provided: setting to %r" % opt.blast_db)
  
  os.environ["HADOOP_HOME"] = opt.hadoop_home
  if not opt.hadoop:
    opt.hadoop = os.path.join(opt.hadoop_home, "bin/hadoop")
  if not opt.hadoop_conf_dir:
    opt.hadoop_conf_dir = os.path.join(opt.hadoop_home, "conf")
  os.environ["HADOOP_CONF_DIR"] = opt.hadoop_conf_dir
  hdfs.reset()

  fs = hdfs.hdfs()
  logger.debug("hdfs params: host=%s, port=%d" % (fs.host, fs.port))
  lfs = hdfs.hdfs("", 0)
  runner = Runner(fs, lfs, logger)

  try:
    db_archive_hdfs = runner.upload_archive(db_archive)
    blast_input_hdfs = runner.run_f2t(input_fasta, opt)
    blast_output_hdfs = runner.run_blast(blast_input_hdfs, db_archive_hdfs,
                                         opt)
    runner.collect_output(blast_output_hdfs, opt)
    logger.info("all done")
  finally:
    lfs.close()
    fs.close()
    if opt.mr_dump_file is not sys.stderr:
      opt.mr_dump_file.close()
Example #60
 def setUp(self):
     self.fs = hdfs.hdfs(self.hdfs_host, self.hdfs_port)
     self.wd = utils.make_wd(self.fs)