Example #1
 def tearDown(self):
   fs = hdfs.hdfs("", 0)
   fs.delete(self.local_wd)
   fs.close()
   fs = hdfs.hdfs("default", 0)
   fs.delete(self.hdfs_wd)
   fs.close()
Example #2
def write(writeFlag):
    if (writeFlag == True):
        # instantiate hadoop
        hdfs.hdfs()
        
        targetPath = config.targetPath
        targetDirectory = config.targetDirectory
        sourceFile = config.sourceFile

        print("Target Path: " + targetPath)
        print("Target Directory: " + targetDirectory)
        print("Source Path: " + sourceFile)

        dumpFile = open(sourceFile, "r")
        fullText = dumpFile.read()
        dumpFile.close()
        
        # write to hadoop
        #hdfs.mkdir(targetDirectory)
        hdfs.dump(fullText, targetPath)
#hdfs.cp(sourceFile, targetPath)

#print (hdfs.ls("test4"))
#files = hdfs.ls("test4")

# read from hadoop
#hdfs.get("test4/hello.txt", "/tmp/hello.txt")
#with open("/tmp/hello.txt") as f:
#	print f.read()

#print(hdfs.ls("test", "hduser1"))
#text = hdfs.load("test/hello.txt")
#print text
Example #3
 def tearDown(self):
     fs = hdfs.hdfs("", 0)
     fs.delete(self.local_wd)
     fs.close()
     fs = hdfs.hdfs("default", 0)
     fs.delete(self.hdfs_wd)
     fs.close()
Example #4
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
Example #5
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
Example #6
 def cache(self):
     orig_fs = hdfs.hdfs(*self.hp_cases[0])
     for host, port in self.hp_cases[1:]:
         fs = hdfs.hdfs(host, port)
         self.assertTrue(fs.fs is orig_fs.fs)
         fs.close()
         self.assertFalse(orig_fs.closed)
     orig_fs.close()
     self.assertTrue(orig_fs.closed)
Example #7
 def cache(self):
   hdfs.hdfs._CACHE.clear()
   orig_fs = hdfs.hdfs(*self.hp_cases[0])
   for host, port in self.hp_cases[1:]:
     fs = hdfs.hdfs(host, port)
     self.assertTrue(fs.fs is orig_fs.fs)
     fs.close()
     self.assertFalse(orig_fs.closed)
   orig_fs.close()
   self.assertTrue(orig_fs.closed)
Example #8
def _hdfs_filesystem():
    """Retrieve references to the local and HDFS file system.

    Need to be able to specify host/port. For now, works off defaults.
    """
    fs = hdfs("default", 0)
    lfs = hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()
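
Note: a generator like the one above only works as a `with` target if it is wrapped as a context manager, so the original module presumably decorates it with `contextlib.contextmanager`. A minimal sketch of that pattern under that assumption (the helper name below is hypothetical):

from contextlib import contextmanager

import pydoop.hdfs as hdfs


@contextmanager
def hdfs_filesystems():
    # "default"/0 resolves the namenode from the Hadoop configuration;
    # ""/0 returns a handle on the local file system.
    fs = hdfs.hdfs("default", 0)
    lfs = hdfs.hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()

# Usage: both handles are closed automatically when the block exits.
# with hdfs_filesystems() as (fs, lfs):
#     lfs.copy("local_data.txt", fs, "remote_data.txt")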
Example #9
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Example #10
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Example #11
def copyFileToHDFSFolder(localpath, hdfspath):
    """
    Copies a file from a local or HDFS path to an HDFS location
    :param localpath: path to local file
    :param hdfspath: path to target file on HDFS
    :return: None
    """
    if localpath.startswith('file:/'):
        lf = H.hdfs("", 0)
    else:
        lf = H.hdfs()
    h = H.hdfs()
    lf.copy(localpath, h, hdfspath)
Example #12
def read(readFlag):
    print(readFlag)
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Example #13
def save_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    saver = tf.train.Saver()
    wd = tempfile.mkdtemp(prefix="pydeep_")
    sub_d = hdfs.path.splitext(hdfs.path.basename(path))[0]
    abs_d = os.path.join(wd, sub_d)
    os.makedirs(abs_d)
    saver.save(session, os.path.join(abs_d, Model.CHECKPOINT_NAME))
    zip_fn = "%s.zip" % abs_d
    shutil.make_archive(*zip_fn.rsplit(".", 1), root_dir=abs_d)
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        local_fs.copy(zip_fn, fs, path)
Example #14
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #15
File: seqal_run.py Project: pinno/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug("Properties:\n%s", "\n".join( sorted([ "%s = %s" % (str(k), str(v)) for k,v in self.properties.iteritems() ]) ))
        self.logger.info("Input: %s; Output: %s; reference: %s", self.options.input, self.options.output, self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.', suffix=str(random.random()), dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(self.remote_bin_name)['name']

                return seal_utilities.run_pipes(full_name, self.options.input, self.options.output,
                    properties=self.properties, args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(self.remote_bin_name) # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted", self.remote_bin_name)
                except:
                    self.logger.error("Error deleting the temporary pipes script %s from HDFS", self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Example #16
 def connect(self):
   for host, port in self.hp_cases:
     for user in self.u_cases:
       expected_user = user or CURRENT_USER
       fs = hdfs.hdfs(host, port, user=user)
       self.assertEqual(fs.user, expected_user)
       fs.close()
Example #17
    def build_map(self, top_dir):
        """\
        For each subdir (corresponding to an image class), build the full
        list of (filename, offset) pair where each bottleneck dump can be
        retrieved.

        {'dandelion': [
            ('part-m-00000', 0),
            ('part-m-00000', 8192),
            ...
            ('part-m-00003', 163840)
        ],
        'roses': [
            ('part-m-00000', 0),
            ...
        ]}
        """
        m = {}
        basename = hdfs.path.basename
        with hdfs.hdfs() as fs:
            for stat in fs.list_directory(top_dir):
                if stat['kind'] != 'directory':
                    continue
                subd = stat['name']
                positions = []
                for s in fs.list_directory(subd):
                    bname = basename(s["name"])
                    if bname.startswith("_"):
                        continue
                    assert s["size"] % self.record_size == 0
                    for i in range(0, s["size"], self.record_size):
                        positions.append((bname, i))
                m[basename(subd)] = positions
        return m
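
A hedged sketch of how the (filename, offset) map built above might be consumed: rejoin the class subdirectory and part file under `top_dir`, seek to the recorded offset and read one fixed-size record. The helper name is hypothetical and `record_size` is assumed to match the writer's record size; `hdfs.open`, `hdfs.path.join`, `seek` and `read` are standard pydoop calls.

import pydoop.hdfs as hdfs


def read_bottleneck(top_dir, bneck_map, class_name, index, record_size):
    # bneck_map is the dict returned by build_map(top_dir) above.
    fname, offset = bneck_map[class_name][index]
    with hdfs.open(hdfs.path.join(top_dir, class_name, fname), "rb") as f:
        f.seek(offset)
        return f.read(record_size)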
Example #18
 def __create_data_file(self):
     host, port, path = split_hdfs_path(self.data_file_name)
     fs = hdfs(host, port)
     f = fs.open_file(path, os.O_WRONLY, 0, 0, 0)
     f.write(self.f.getvalue())
     f.close()
     fs.close()
Example #19
class StorageHandler:

    hdfsobj = hdfs.hdfs()

    def __init__(self, host, port):
        self.hdfsobj = hdfs.hdfs(host,
                                 port,
                                 user="******",
                                 groups=["vagrant"])

    def pwd(self):
        return self.hdfsobj.working_directory()

    def listDirectory(self, path="/"):
        return self.hdfsobj.list_directory(path)

    def delete(self, path):
        self.hdfsobj.delete(path, False)

    def put(self, source, destination):
        hdfs.put(source, destination)

    def copyFile(self, source, destination):
        self.hdfsobj.copy(source, self.hdfsobj, destination)

    def write(self, path, mod, data):
        with hdfs.open(path, mod) as f:
            f.write(data)
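
A brief usage sketch for the wrapper class above; the host, port and paths are placeholders rather than values from the original project.

handler = StorageHandler("localhost", 9000)
print(handler.pwd())
handler.put("local_report.csv", "/tmp/report.csv")
for entry in handler.listDirectory("/tmp"):
    print(entry["name"], entry["kind"])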
Example #20
File: db.py Project: AshinGau/eventdb
 def __init__(self, tableName, host='192.168.60.64', infoTable='runInfo'):
     self.tableName = tableName
     self.conn = happybase.Connection(host)
     self.table = self.conn.table(infoTable)
     self.eventdb = self.conn.table('HEP:' + tableName)
     self.escape = escape()
     self.fs = hdfs.hdfs(host=host, port=8022, user='******')
Example #21
 def connect(self):
     for host, port in self.hp_cases:
         for user in self.u_cases:
             expected_user = user or CURRENT_USER
             fs = hdfs.hdfs(host, port, user=user)
             self.assertEqual(fs.user, expected_user)
             fs.close()
Example #22
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Example #23
  def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
    """
    Check directories above the remote module and issue a warning if
    they are not traversable by all users.

    The reasoning behind this is mainly aimed at set-ups with a centralized
    Hadoop cluster, accessed by all users, and where the Hadoop task tracker
    user is not a superuser; an example may be if you're running a shared
    Hadoop without HDFS (using only a POSIX shared file system).  The task
    tracker correctly changes user to the job requester's user for most
    operations, but not when initializing the distributed cache, so jobs who
    want to place files not accessible by the Hadoop user into dist cache fail.
    """
    host, port, path = hdfs.path.split(abs_remote_path)
    if host == '' and port == 0: # local file system
      host_port = "file:///"
    else:
      # FIXME: this won't work with any scheme other than hdfs:// (e.g., s3)
      host_port = "hdfs://%s:%s/" % (host, port)
    path_pieces = path.strip('/').split(os.path.sep)
    fs = hdfs.hdfs(host, port)
    for i in xrange(0, len(path_pieces)):
      part = os.path.join(host_port, os.path.sep.join(path_pieces[0:i+1]))
      permissions = fs.get_path_info(part)['permissions']
      if permissions & 0111 != 0111:
        self.logger.warning(
          "the remote module %s may not be readable\n" +
          "by the task tracker when initializing the distributed cache.\n" +
          "Permissions on path %s: %s", abs_remote_path, part, oct(permissions))
        break
Example #24
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Example #25
def get():
    """ Get a handle to pydoop hdfs using the default namenode (specified in hadoop config)

    Returns:
        Pydoop hdfs handle
    """
    return hdfs.hdfs('default', 0, user=project_user())
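
A minimal usage sketch for the helper above, assuming a reachable default namenode; the directory path is a placeholder.

fs = get()
try:
    for entry in fs.list_directory("/tmp"):
        print(entry["name"], entry["size"])
finally:
    fs.close()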
Example #26
    def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
        """
        Check directories above the remote module and issue a warning if
        they are not traversable by all users.

        The reasoning behind this is mainly aimed at set-ups with a
        centralized Hadoop cluster, accessed by all users, and where
        the Hadoop task tracker user is not a superuser; an example
        may be if you're running a shared Hadoop without HDFS (using
        only a POSIX shared file system).  The task tracker correctly
        changes user to the job requester's user for most operations,
        but not when initializing the distributed cache, so jobs who
        want to place files not accessible by the Hadoop user into
        dist cache fail.
        """
        host, port, path = hdfs.path.split(abs_remote_path)
        if host == '' and port == 0:  # local file system
            host_port = "file:///"
        else:
            # FIXME: this won't work with any scheme other than
            # hdfs:// (e.g., s3)
            host_port = "hdfs://%s:%s/" % (host, port)
        path_pieces = path.strip('/').split(os.path.sep)
        fs = hdfs.hdfs(host, port)
        for i in range(0, len(path_pieces)):
            part = os.path.join(host_port,
                                os.path.sep.join(path_pieces[0:i + 1]))
            permissions = fs.get_path_info(part)['permissions']
            if permissions & 0o111 != 0o111:
                self.logger.warning(
                    ("remote module %s may not be readable by the task "
                     "tracker when initializing the distributed cache.  "
                     "Permissions on %s: %s"), abs_remote_path, part,
                    oct(permissions))
                break
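
The `permissions & 0o111` check above verifies that the execute (directory traverse) bit is set for owner, group and others. A standalone illustration with made-up permission values:

for perm in (0o755, 0o750, 0o700):
    traversable_by_all = (perm & 0o111) == 0o111
    print(oct(perm), "->", traversable_by_all)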
Example #27
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #28
def load_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, hdfs.path.basename(path))
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        fs.copy(path, local_fs, zip_fn)
    unpack_dir = os.path.splitext(zip_fn)[0]
    shutil.unpack_archive(zip_fn, unpack_dir)
    ckpt_path = os.path.join(unpack_dir, Model.CHECKPOINT_NAME)
    metagraph_path = "%s.meta" % ckpt_path
    if not os.path.isfile(metagraph_path):
        raise RuntimeError("checkpoint files not found in %s" % zip_fn)
    saver = tf.train.import_meta_graph(metagraph_path)
    saver.restore(session, ckpt_path)
Example #29
 def __init__(self, context):
     super(WholeFileReader, self).__init__(context)
     self.logger = LOGGER.getChild("WholeFileReader")
     raw_split = context.get_input_split(raw=True)
     self.isplit = OpaqueInputSplit().read(io.BytesIO(raw_split))
     self.paths = self.isplit.payload
     self.n_paths = len(self.paths)
     self.fs = hdfs.hdfs()
Example #30
def clearLocatie():
    """
    Removes the locatie parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(LOCATIE):
        h.delete(LOCATIE)
Example #31
def clearTelling():
    """
    Removes the telling parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(TELLING):
        h.delete(TELLING)
Example #32
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path']) as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Example #33
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path'], 'rt') as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Example #34
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta") if
                       self.isplit.offset == 0 else None)
Example #35
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Example #36
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Example #37
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(
         self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta")
                       if self.isplit.offset == 0 else None)
Example #38
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Example #39
def clearResults(name=""):
    """
    Clears target result parquet table name, or all result parquet tables if no name is given
    :param name: the target result parquet table name
    :return: None
    """
    p = path.join(RESULT_DIR, name)
    h = hdfs()
    if h.exists(p):
        h.delete(p)
Example #40
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Example #41
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"],
                             compression_type='gzip',
                             acks=1,
                             retries=2)

    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % (myfile))
            continue

        elif myfile["kind"] == "file":
            pass

        else:
            raise Exception, "Unknown kind %s for %s" % (myfile["kind"],
                                                         myfile["name"])

        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % (myfile))
            continue

        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % (myfile))
            continue

        logger.info("Working on %s" % (myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)

        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" % (myfile["name"]))

        #sleep some time
        time.sleep(1)

    # for all files in HDFS
    producer.close()
Example #42
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10,
                       10)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Example #43
File: treewalk.py Project: xuande/pydoop
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Example #44
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Example #45
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)

        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(),
                                 freq="B")
        all_days = [x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size > 3:
                dir_name = "/tmp/ticks/" + str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
Example #46
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0])
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(
         ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
     )
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Example #47
File: seqal_run.py Project: okulev/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug(
                "Properties:\n%s", "\n".join(
                    sorted([
                        "%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()
                    ])))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                         self.options.input, self.options.output,
                         self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.',
                                                   suffix=str(random.random()),
                                                   dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(
                    self.remote_bin_name)['name']

                return hadut.run_pipes(full_name,
                                       self.options.input,
                                       self.options.output,
                                       properties=self.properties,
                                       args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(
                        self.remote_bin_name
                    )  # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted",
                                      self.remote_bin_name)
                except:
                    self.logger.error(
                        "Error deleting the temporary pipes script %s from HDFS",
                        self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Example #48
def main(args):
    host, port, out_dir = hdfs.path.split(args.out_dir)
    fs = hdfs.hdfs(host, port)
    fs.create_directory(out_dir)
    join = os.path.join
    for dt, path in get_images(args.in_dir):
        out_path = join(out_dir, f"{dt.strftime(OUT_FMT)}.png")
        if not args.overwrite and fs.exists(out_path):
            continue
        with io.open(path, "rb") as fi:
            with fs.open_file(out_path, "wb") as fo:
                fo.write(fi.read())
Example #49
 def copy(self):
   local_fs = hdfs.hdfs('', 0)
   local_wd = make_wd(local_fs)
   from_path = os.path.join(local_wd, uuid.uuid4().hex)
   content = uuid.uuid4().hex
   with open(from_path, "w") as f:
     f.write(content)
   to_path = self._make_random_file()
   local_fs.copy(from_path, self.fs, to_path)
   local_fs.close()
   with self.fs.open_file(to_path) as f:
     self.assertEqual(f.read(), content)
   shutil.rmtree(local_wd)
Example #50
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)
    
    # a global variable
    global producer 

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"], compression_type='gzip', acks=1, retries=2)
    
    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" %(myfile))
            continue
        
        elif myfile["kind"] == "file":
            pass
        
        else:
            raise Exception, "Unknown kind %s for %s" %(myfile["kind"], myfile["name"])
            
        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        logger.info("Working on %s" %(myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)
            
        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" %(myfile["name"]))
        
        #sleep some time
        time.sleep(1)
                    
    # for all files in HDFS
    producer.close()
Example #51
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">"*15+code+">"*15)

        all_days=pd.date_range(start=str(time_to_market),end=dt.date.today(),freq="B")
        all_days=[x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving "+code+"@"+str(day)+"...")
            while True:
                try:
                    df=ts.get_tick_data(code,date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size >3:
                dir_name="/tmp/ticks/"+str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name=dir_name+"/"+str(day)+".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s=hdfs.hdfs(host="spark-1",port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name,"./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<"*15+code+"<"*15)
    return (socket.gethostname(),code)
Example #52
def main(argv=sys.argv[1:]):
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--conf-dir", metavar="HADOOP_CONF_DIR")
  args = parser.parse_args(argv)
  if args.conf_dir:
    os.environ["HADOOP_CONF_DIR"] = os.path.abspath(args.conf_dir)
    hdfs.reset()
  fs = hdfs.hdfs()
  print "--- OPEN ---"
  dump_status(fs)
  print "cwd:", fs.working_directory()
  print
  fs.close()
  print "--- CLOSED ---"
  dump_status(fs)
Example #53
 def setUp(self):
   wd = tempfile.mkdtemp()
   wd_bn = os.path.basename(wd)
   self.local_wd = "file:%s" % wd
   fs = hdfs.hdfs("default", 0)
   fs.create_directory(wd_bn)
   self.hdfs_wd = fs.get_path_info(wd_bn)["name"]
   fs.close()
   basenames = ["test_path_%d" % i for i in xrange(2)]
   self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames]
   self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames]
   self.data = make_random_data(4*BUFSIZE + BUFSIZE/2)
   for path in self.local_paths:
     self.assertTrue(path.startswith("file:"))
   for path in self.hdfs_paths:
     if not hdfs.default_is_local():
       self.assertTrue(path.startswith("hdfs:"))
Example #54
 def stat(self):
     if hdfs.default_is_local():
         return
     bn = '%s%s' % (make_random_str(), UNI_CHR)
     fn = '/user/%s/%s' % (DEFAULT_USER, bn)
     fs = hdfs.hdfs("default", 0)
     p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
     with fs.open_file(fn, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(fn)
     fs.close()
     s = hdfs.path.stat(p)
     for n1, n2 in self.NMAP.iteritems():
         attr = getattr(s, n1, None)
         self.assertFalse(attr is None)
         self.assertEqual(attr, info[n2])
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(p)
Example #55
def _clean_up_bcl_output(output_dir):
    """
    Delete prq files with no data
    """
    host, port, _ = phdfs.path.split(output_dir)
    fs = phdfs.hdfs(host, port)
    count = 0
    for item in fs.walk(output_dir):
        if item['kind'] == 'file' and item['name'].endswith('.gz') and item['size'] < 30:
            if not item['name'].startswith('hdfs://'):
                raise RuntimeError("Insanity!  Trying to delete %s!" % item['name'])
            fs.delete(item['name'], recursive=False)
            count += 1
    logger.info("Removed %d empty files from bcl output", count)

    undet_path = os.path.join(output_dir, 'Undetermined')
    if phdfs.path.exists(undet_path):
        logger.info("Removing reads from Undetermined dataset %s", undet_path)
        fs.delete(undet_path)
Example #56
def main(argv):
  
  try:
    depth = int(argv[1])
    span = int(argv[2])
  except IndexError:
    print "Usage: python %s DEPTH SPAN" % argv[0]
    sys.exit(2)

  fs = hdfs.hdfs()
  try:
    root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
    try:
      fs.delete(root)
    except IOError:
      pass
    fs.create_directory(root)
    treegen(fs, root, depth, span)
  finally:
    fs.close()
Example #57
    def rename_compressed_files(self, file_table):
        # find the extension
        output_files = hdfs.ls(self.output_path)
        if len(output_files) == 0:
            return

        compressor_extension = self.get_compressor_extension(output_files)
        self.log.debug("compressor extension is %s", compressor_extension)

        hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
        if hdfs_host == '':
            is_local_fs = True
        else:
            is_local_fs = False
            output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)

        file_table.seek(0)
        for mapid, line in enumerate(file_table.xreadlines()):
            _, _, relative_output_name = line.rstrip('\n').split('\t')
            # we expect the map task ids to be assigned in the same order as the input
            # file list, so we can match the input file to an output file by its position
            # in the input file list.
            hadoop_output = os.path.join(self.output_path, "part-%05d" % mapid) + compressor_extension
            desired_file_name = os.path.join(self.output_path, relative_output_name) + compressor_extension
            if hadoop_output != desired_file_name:
                self.log.debug("renaming %s to %s", hadoop_output, desired_file_name)
                if is_local_fs:
                    # Though we could transparently use hdfs.move for both local fs and hdfs,
                    # using native methods for the local fs should be faster.
                    # os.renames automatically creates necessary parent directories for destination.
                    os.renames(urlparse(hadoop_output).path, urlparse(desired_file_name).path)
                else:
                    # create the output subdirectory, if necessary
                    dirname = os.path.dirname(relative_output_name)
                    if dirname:
                        output_hdfs.create_directory( os.path.join(self.output_path, dirname) )
                    if output_hdfs.exists(desired_file_name):
                        raise RuntimeError("Can't overwrite file in output directory: %s" % desired_file_name)
                    output_hdfs.move(hadoop_output, output_hdfs, desired_file_name)
Example #58
 def setUp(self):
     self.fs = hdfs()
     self.wd = utils.make_wd(self.fs)
Example #59
File: mr_blast.py Project: crs4/vispa
def main(argv):

  parser = make_parser()
  opt, args = parser.parse_args()
  try:
    input_fasta = args[0]
    db_archive = args[1]
  except IndexError:
    parser.print_help()
    sys.exit(2)

  STR_GENERATOR.prefix = os.path.basename(input_fasta)

  logger = logging.getLogger()
  for h in logger.handlers:
    logger.removeHandler(h)
  opt.log_level_str = opt.log_level
  opt.log_level = getattr(logging, opt.log_level)
  kwargs = {'format': LOG_FORMAT,
            'datefmt': LOG_DATEFMT,
            'level': opt.log_level}
  if opt.log_file:
    kwargs['filename'] = opt.log_file
  logging.basicConfig(**kwargs)

  logger.debug("cli args: %r" % (args,))
  logger.debug("cli opts: %s" % opt)

  if opt.mr_dump_file:
    opt.mr_dump_file = open(opt.mr_dump_file, "w")
  else:
    opt.mr_dump_file = sys.stderr
  
  if not opt.blast_db:
    opt.blast_db = os.path.basename(db_archive).split(".", 1)[0]
    logger.info("--blast-db not provided: setting to %r" % opt.blast_db)
  
  os.environ["HADOOP_HOME"] = opt.hadoop_home
  if not opt.hadoop:
    opt.hadoop = os.path.join(opt.hadoop_home, "bin/hadoop")
  if not opt.hadoop_conf_dir:
    opt.hadoop_conf_dir = os.path.join(opt.hadoop_home, "conf")
  os.environ["HADOOP_CONF_DIR"] = opt.hadoop_conf_dir
  hdfs.reset()

  fs = hdfs.hdfs()
  logger.debug("hdfs params: host=%s, port=%d" % (fs.host, fs.port))
  lfs = hdfs.hdfs("", 0)
  runner = Runner(fs, lfs, logger)

  try:
    db_archive_hdfs = runner.upload_archive(db_archive)
    blast_input_hdfs = runner.run_f2t(input_fasta, opt)
    blast_output_hdfs = runner.run_blast(blast_input_hdfs, db_archive_hdfs,
                                         opt)
    runner.collect_output(blast_output_hdfs, opt)
    logger.info("all done")
  finally:
    lfs.close()
    fs.close()
    if opt.mr_dump_file is not sys.stderr:
      opt.mr_dump_file.close()
Example #60
 def setUp(self):
     self.fs = hdfs.hdfs(self.hdfs_host, self.hdfs_port)
     self.wd = utils.make_wd(self.fs)