Example #1
 def good(self):
     base_path = make_random_str()
     for path in base_path, base_path + UNI_CHR:
         hdfs.dump("foo\n", path)
         self.assertTrue(hdfs.path.exists(path))
         hdfs.rmr(path)
         self.assertFalse(hdfs.path.exists(path))
Example #2
 def samefile_link(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     self.assertTrue(hdfs.path.samefile('file:%s' % link, 'file:%s' % wd_))
     hdfs.rmr(wd)
Example #3
def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    ## Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    # Do an ls to find all month dirs
    for year_dir in year_dirs:
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir,
                                           month_dir))
            if not files:
                LOGGER.debug(
                    "Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1

        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete year directory too
            LOGGER.debug(
                "Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
Example #4
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #5
 def good(self):
     base_path = make_random_str()
     for path in base_path, base_path + UNI_CHR:
         hdfs.dump("foo\n", path)
         self.assertTrue(hdfs.path.exists(path))
         hdfs.rmr(path)
         self.assertFalse(hdfs.path.exists(path))
Example #6
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
             self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Example #7
 def samefile_link(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     self.assertTrue(hdfs.path.samefile('file:%s' % link, 'file:%s' % wd_))
     hdfs.rmr(wd)
Example #8
def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    if nmaps > len(input_dirs):
        nmaps = len(input_dirs)
        LOGGER.warn("Not enough input dirs, will only do %d splits" % nmaps)
    splits = common.balanced_split(input_dirs, nmaps)
    splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
    with hdfs.open(splits_uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    properties = {
        common.GRAPH_ARCH_KEY: model.name,
        common.LOG_LEVEL_KEY: log_level,
        common.NUM_MAPS_KEY: nmaps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    }
    submitter.set_args(
        argparse.Namespace(
            D=list(properties.items()),
            avro_input=None,
            avro_output=None,
            cache_archive=None,
            cache_file=None,
            disable_property_name_conversion=True,
            do_not_use_java_record_reader=True,
            do_not_use_java_record_writer=True,
            entry_point="__main__",
            hadoop_conf=None,
            input=input_dirs[0],  # does it matter?
            input_format=None,
            job_conf=None,
            job_name="dump_weights",
            keep_wd=False,
            libjars=None,
            log_level=log_level,
            module=os.path.splitext(os.path.basename(__file__))[0],
            no_override_env=False,
            no_override_home=False,
            no_override_ld_path=False,
            no_override_path=False,
            no_override_pypath=False,
            num_reducers=0,
            output=output_dir,
            output_format=None,
            pretend=False,
            pstats_dir=None,
            python_program=sys.executable,
            python_zip=[zip_fn],
            set_env=None,
            upload_archive_to_cache=None,
            upload_file_to_cache=[__file__],
        ))
    submitter.run()
    hdfs.rmr(splits_uri)
    if collate:
        collate_mapred_output(output_dir)
    shutil.rmtree(wd)
Example #9
def _try_remove_hdfs_dir(path):
    try:
        phdfs.rmr(path)
        return True
    except StandardError as e:
        logger.error("Error while trying to remove directory %s", path)
        logger.exception(e)
    return False
Example #10
File: test_path.py Project: crs4/pydoop
 def realpath(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     expected_path = 'file:%s' % os.path.realpath(wd_)
     self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
     hdfs.rmr(wd)
Example #11
 def __clean_wd(self):
     if self.remote_wd:
         try:
             self.logger.debug("Removing temporary working directory %s",
                               self.remote_wd)
             hdfs.rmr(self.remote_wd)
         except IOError:
             pass
Example #12
 def realpath(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     expected_path = 'file:%s' % os.path.realpath(wd_)
     self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
     hdfs.rmr(wd)
Example #13
 def remove_directory(self,hdfs_path):
     if (hdfs_path == ""):
         print "No directory specified to delete!"
         return False
     elif(self.file_exist(hdfs_path)==False):
         return False
     hdfs.rmr(hdfs_path)
     return True
Example #14
def cleanup(out_pathset):
  # clean-up job output
  for path in out_pathset:
    try:
      print >> sys.stderr, "Deleting output path", path
      phdfs.rmr(path)
    except StandardError as e:
      print >> sys.stderr, "Error!", str(e)
Example #15
def _try_remove_hdfs_dir(path):
    try:
        phdfs.rmr(path)
        return True
    except StandardError as e:
        logger.error("Error while trying to remove directory %s", path)
        logger.exception(e)
    return False
Example #16
 def _clean_up(*paths):
     for p in paths:
         try:
             log.debug("Removing path: %s", p)
             phdfs.rmr(p)
         except StandardError as e:
             log.warning("Error deleting path %s", p)
             log.exception(e)
Example #17
def runSparkNumASesInROAs(sc, ip_type):

    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"

    savePath = "/hdfs-to-local-path/rpki/results/roas-covering-AScnt-%s" % ip_type
    localPath = "/home/tjchung/research/rpki/src/spark/results/roas-covering-AScnt-%s" % ip_type

    try:
        hdfs.rmr(savePath)
    except:
        pass

    tals = [
        "apnic", "apnic-iana", "apnic-afrinic", "apnic-arin", "apnic-lacnic",
        "apnic-ripe", "lacnic", "ripencc", "arin", "afrinic", "localcert"
    ]

    k  = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal): isIPv4v6(prefix_addr, ip_type))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal): ((time, tal), asID))\
        .groupByKey()\
        .map(lambda ( (time, tal), num_ases): (time, tal, len(set(num_ases))))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_ASes'])

    grouped = df.rdd\
                .map(lambda row: (row.date, (row.tal, row.num_ASes)))\
                .groupByKey()

    def make_row(kv):
        k, vs = kv  # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, 0) for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
            .map(lambda row: (row['date'], row["apnic"], row["apnic-iana"], row["apnic-afrinic"], row["apnic-arin"], row["apnic-lacnic"], row["apnic-ripe"], row["lacnic"], row["ripencc"], row["arin"], row["afrinic"], row["localcert"]))\
            .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass

    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
Example #18
 def realpath(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     expected_path = ('file:%s%s' % ("/private", wd_)
                      if sys.platform == "darwin" else 'file:%s' % wd_)
     self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
     hdfs.rmr(wd)
Example #19
 def __clean_wd(self):
   if self.remote_wd:
     try:
       self.logger.debug(
         "Removing temporary working directory %s", self.remote_wd
         )
       hdfs.rmr(self.remote_wd)
     except IOError:
       pass
Example #20
def delete_files(remote_basedir, retention):
    inodes = walk_remotely(remote_basedir)
    now = time.time()
    deleted_files = []
    for inode in inodes:
        if now - inode['last_mod'] > retention and inode['kind'] == 'file':
            LOGGER.debug("Deleting file {0}".format(inode['path']))
            hdfs.rmr(inode['path'])
            deleted_files.append(inode['path'])
    return deleted_files
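A brief usage sketch for delete_files() above: it compares the epoch timestamps returned by walk_remotely() against time.time(), so retention is given in seconds. The base directory below is illustrative, not taken from the example.

# Hypothetical call: remove files older than 30 days under an illustrative base dir
# (retention is in seconds because 'last_mod' is an epoch timestamp).
removed = delete_files("/data/archive", retention=30 * 86400)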
Example #21
 def realpath(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     wd = 'file:%s' % wd_
     link = os.path.join(wd_, make_random_str())
     os.symlink(wd_, link)
     expected_path = ('file:%s%s' % ("/private", wd_)
                      if sys.platform == "darwin"
                      else 'file:%s' % wd_)
     self.assertEqual(hdfs.path.realpath('file:%s' % link), expected_path)
     hdfs.rmr(wd)
Example #22
def delete_files(remote_basedir, retention):
    inodes = walk_remotely(remote_basedir)
    now = time.time()
    deleted_files = []
    for inode in inodes:
        if now - inode['last_mod'] > retention and inode['kind'] == 'file':
            LOGGER.debug("Deleting file {0}".format(inode['path']))
            hdfs.rmr(inode['path'])
            deleted_files.append(inode['path'])
    return deleted_files
Example #23
 def runTest(self):
     path = make_random_str() + UNI_CHR
     hdfs.dump("foo\n", path)
     st = hdfs.path.stat(path)
     atime, mtime = [getattr(st, 'st_%stime' % _) for _ in 'am']
     new_atime, new_mtime = atime + 100, mtime + 200
     hdfs.path.utime(path, (new_atime, new_mtime))
     st = hdfs.path.stat(path)
     self.assertEqual(st.st_atime, new_atime)
     self.assertEqual(st.st_mtime, new_mtime)
     hdfs.rmr(path)
Example #24
 def runTest(self):
     path = make_random_str() + UNI_CHR
     hdfs.dump("foo\n", path)
     st = hdfs.path.stat(path)
     atime, mtime = [getattr(st, 'st_%stime' % _) for _ in 'am']
     new_atime, new_mtime = atime + 100, mtime + 200
     hdfs.path.utime(path, (new_atime, new_mtime))
     st = hdfs.path.stat(path)
     self.assertEqual(st.st_atime, new_atime)
     self.assertEqual(st.st_mtime, new_mtime)
     hdfs.rmr(path)
Example #25
def runSparkNumPrefixWithMaxlen(sc, ip_type="ipv4"):

    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    localPath = "/home/tjchung/research/rpki/src/spark/results/roa-prefix-with-maxlength"
    savePath = "/hdfs-to-local-path/rpki/results/roa-prefix-with-maxlength"

    try:
        hdfs.rmr(savePath)
    except:
        pass
    k  = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal): isIPv4v6(prefix_addr, ip_type) )\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal): (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal): ((time,  str(int( (prefix_len != maxlen) and maxlen != "None" ))), 1))\
        .reduceByKey(lambda a, b: a+ b)\
        .map(lambda ((time, hasMaxlen), cnt): (time, hasMaxlen, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'hasMaxlen', 'cnt'])

    grouped = df.rdd\
                .map(lambda row: (row.date, (row.hasMaxlen, row.cnt)))\
                .groupByKey()

    def make_row(kv):
        k, vs = kv
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, 0)
                      for k in ["date", "0", "1"]})  # 1 means has a maxlen

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
            .map(lambda row: (row['date'], row['0'], row['1']))\
            .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass

    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
Example #26
def _tear_down_flink_session(app_id):
    if not app_id:
        raise ValueError("_tear_down_flink_session: empty app id!")

    cmd = ['yarn', 'application', '-kill', app_id]
    logger.info("Killing flink session with app id '%s'", app_id)
    logger.debug("Command: %s", cmd)
    subprocess.check_call(cmd)
    # clean up temporary yarn session files, if any
    path = ".flink/" + app_id
    if phdfs.path.exists(path):
        logger.info("Also removing the session's temporary files in %s", path)
        phdfs.rmr(path)
Example #27
def _tear_down_flink_session(app_id):
    if not app_id:
        raise ValueError("_tear_down_flink_session: empty app id!")

    cmd = [ 'yarn', 'application', '-kill', app_id ]
    logger.info("Killing flink session with app id '%s'", app_id)
    logger.debug("Command: %s", cmd)
    subprocess.check_call(cmd)
    # clean up temporary yarn session files, if any
    path = ".flink/" + app_id
    if phdfs.path.exists(path):
        logger.info("Also removing the session's temporary files in %s", path)
        phdfs.rmr(path)
Example #28
    def execute(self):
        """
        Execute workflow in dedicated directory
        """
        hdfs_output_dir = "workflow_output_{}".format(time.time())
        logger.debug("Setting up workflow")
        logger.debug("CWD: %s", os.getcwd())
        logger.debug("workflow output directory: %s", hdfs_output_dir)
        cmd = [self._program] + [str(arg) for arg in self._args]
        cmd.append(self._input_dir)
        cmd.append(hdfs_output_dir)
        logger.debug("workflow command: %s", cmd)
        wf_logfile = os.path.abspath(GlobalConf['workflow_logfile'])
        logger.info("Executing worflow")
        logger.info("Writing workflow log to %s", wf_logfile)

        self._clear_caches()

        try:
            with open(wf_logfile, 'a') as f:
                logger.info("Starting workflow")
                start_time = time.time()
                retcode = subprocess.call(cmd,
                                          stdout=f,
                                          stderr=subprocess.STDOUT)
            end_time = time.time()
            run_time = end_time - start_time

            attempt_info = AttemptInfo(cmd, retcode, wf_logfile, run_time)

            if retcode == 0:
                logger.info("Workflow finished")
                logger.info("Attempt took %0.2f seconds", run_time)
                bcl, align = self._get_part_times_from_log(wf_logfile)
                attempt_info.bcl_secs = bcl
                attempt_info.align_secs = align
            else:
                logger.info("Workflow FAILED with exit code %s", retcode)
            return attempt_info
        finally:
            try:
                if phdfs.path.exists(hdfs_output_dir):
                    logger.debug(
                        "Removing workflow's temporary output directory %s",
                        hdfs_output_dir)
                    phdfs.rmr(hdfs_output_dir)
            except StandardError as e:
                logger.error(
                    "Failed to clean up workflow's output directory  %s",
                    hdfs_output_dir)
                logger.exception(e)
Example #29
 def test_isdir(self):
     for path in self.path, self.u_path:
         self.assertFalse(hdfs.path.isdir(path))
         try:
             hdfs.dump("foo\n", path)
             self.assertFalse(hdfs.path.isdir(path))
             hdfs.rmr(path)
             hdfs.mkdir(path)
             self.assertTrue(hdfs.path.isdir(path))
         finally:
             try:
                 hdfs.rmr(path)
             except IOError:
                 pass
Example #30
 def test_isdir(self):
   path = utils.make_random_str()
   self.assertFalse(hdfs.path.isdir(path))
   try:
     hdfs.dump("foo\n", path)
     self.assertFalse(hdfs.path.isdir(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertTrue(hdfs.path.isdir(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #31
 def test_kind(self):
   path = utils.make_random_str()
   self.assertTrue(hdfs.path.kind(path) is None)
   try:
     hdfs.dump("foo\n", path)
     self.assertEqual('file', hdfs.path.kind(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertEqual('directory', hdfs.path.kind(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #32
 def test_kind(self):
     for path in self.path, self.u_path:
         self.assertTrue(hdfs.path.kind(path) is None)
         try:
             hdfs.dump("foo\n", path)
             self.assertEqual('file', hdfs.path.kind(path))
             hdfs.rmr(path)
             hdfs.mkdir(path)
             self.assertEqual('directory', hdfs.path.kind(path))
         finally:
             try:
                 hdfs.rmr(path)
             except IOError:
                 pass
Example #33
 def test_isdir(self):
     for path in self.path, self.u_path:
         self.assertFalse(hdfs.path.isdir(path))
         try:
             hdfs.dump("foo\n", path)
             self.assertFalse(hdfs.path.isdir(path))
             hdfs.rmr(path)
             hdfs.mkdir(path)
             self.assertTrue(hdfs.path.isdir(path))
         finally:
             try:
                 hdfs.rmr(path)
             except IOError:
                 pass
Example #34
 def test_kind(self):
   path = utils.make_random_str()
   self.assertTrue(hdfs.path.kind(path) is None)
   try:
     hdfs.dump("foo\n", path)
     self.assertEqual('file', hdfs.path.kind(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertEqual('directory', hdfs.path.kind(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #35
 def test_isdir(self):
   path = utils.make_random_str()
   self.assertFalse(hdfs.path.isdir(path))
   try:
     hdfs.dump("foo\n", path)
     self.assertFalse(hdfs.path.isdir(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertTrue(hdfs.path.isdir(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #36
 def test_kind(self):
     for path in self.path, self.u_path:
         self.assertTrue(hdfs.path.kind(path) is None)
         try:
             hdfs.dump("foo\n", path)
             self.assertEqual('file', hdfs.path.kind(path))
             hdfs.rmr(path)
             hdfs.mkdir(path)
             self.assertEqual('directory', hdfs.path.kind(path))
         finally:
             try:
                 hdfs.rmr(path)
             except IOError:
                 pass
Example #37
def main(argv=None):

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)

    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks" %
                (len(bneck_map), [len(_) for _ in bneck_map.values()]))
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
Example #38
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except:
                pass
    except:
        pass
    print "hdfs tgt_dict=", tgt_dict

    # get source info
    src_fs = glob.glob(src_fname)
    print "src_fs=", src_fs
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except:
            src_ctime_int = None
        print "src_ctime_int=", src_ctime_int

        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put or rm/put
        try:
            if not src_bfname in tgt_dict:
                #insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: put ", src_bfname, "to", hdfs_tgt_dir
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: replace ", tgt_fname, "by", sf
            else:
                print tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int
        except:
            e = sys.exc_info()[0]
            print "Error: ", e
Example #39
def main(argv):
  logger = logging.getLogger("main")
  logger.setLevel(logging.INFO)
  local_input = argv[1]
  with open(MR_SCRIPT) as f:
    pipes_code = pts.add_sys_path(f.read())
  runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
  runner.set_input(local_input, put=True)
  runner.set_exe(pipes_code)
  runner.run()
  res = runner.collect_output()
  runner.clean()
  hdfs.rmr(HDFS_WD)
  logger.info("checking results")
  expected_res = local_vc(local_input)
  logger.info(check(res, expected_res))
Example #40
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    local_input = argv[1]
    with open(MR_SCRIPT) as f:
        pipes_code = pts.add_sys_path(f.read())
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    runner.set_input(local_input, put=True)
    runner.set_exe(pipes_code)
    runner.run()
    res = runner.collect_output()
    runner.clean()
    hdfs.rmr(HDFS_WD)
    logger.info("checking results")
    expected_res = local_vc(local_input)
    logger.info(check(res, expected_res))
Example #41
def clean_directory(dir, spam_life=spam_ttl):
    # Accepts a directory name and deletes anything older than TTL in days
    file_list = []

    # check the existence of the directory
    if hdfs.path.exists(dir):
        # get a list of all files there
        file_list = hdfs.lsl(dir)

    # loop through the file list
    for listing in file_list:
        # get the last access time of the file and compare to spam lifetime
        if time.time() - listing[
                'last_access'] > 86400 * spam_life:  # 86400 seconds in a day
            # if its too old delete it and log that it was deleted
            logger.info('Deleting ' + listing['name'])
            hdfs.rmr(listing['name'])
Example #42
    def execute(self):
        """
        Execute workflow in dedicated directory
        """
        hdfs_output_dir = "workflow_output_{}".format(time.time())
        logger.debug("Setting up workflow")
        logger.debug("CWD: %s", os.getcwd())
        logger.debug("workflow output directory: %s", hdfs_output_dir)
        cmd = [ self._program ] + [ str(arg) for arg in  self._args ]
        cmd.append(self._input_dir)
        cmd.append(hdfs_output_dir)
        logger.debug("workflow command: %s", cmd)
        wf_logfile = os.path.abspath(GlobalConf['workflow_logfile'])
        logger.info("Executing worflow")
        logger.info("Writing workflow log to %s", wf_logfile)

        self._clear_caches()

        try:
            with open(wf_logfile, 'a') as f:
                logger.info("Starting workflow")
                start_time = time.time()
                retcode = subprocess.call(cmd, stdout=f, stderr=subprocess.STDOUT)
            end_time = time.time()
            run_time = end_time - start_time

            attempt_info = AttemptInfo(cmd, retcode, wf_logfile, run_time)

            if retcode == 0:
                logger.info("Workflow finished")
                logger.info("Attempt took %0.2f seconds", run_time)
                bcl, align = self._get_part_times_from_log(wf_logfile)
                attempt_info.bcl_secs = bcl
                attempt_info.align_secs = align
            else:
                logger.info("Workflow FAILED with exit code %s", retcode)
            return attempt_info
        finally:
            try:
                if phdfs.path.exists(hdfs_output_dir):
                    logger.debug("Removing workflow's temporary output directory %s", hdfs_output_dir)
                    phdfs.rmr(hdfs_output_dir)
            except StandardError as e:
                logger.error("Failed to clean up workflow's output directory  %s", hdfs_output_dir)
                logger.exception(e)
Example #43
def main(args):
    logger.setLevel(logging.DEBUG)

    options = parse_args(args)
    logger.setLevel(options.log_level)

    logger.info("Running workflow with the following configuration")
    logger.info("n_nodes: %d", options.n_nodes)
    logger.info("bcl converter jar %s", options.jar_path)
    logger.info("Other conf:\n%s", GlobalConf)

    start_time = time.time()
    try:
        if options.skip_bcl:
            logger.info("Skipping bcl conversion as requested")
            tmp_output_dir = options.input
        else:
            tmp_output_dir = mk_hdfs_temp_dir('bcl_output_')
            logger.debug("Temporary output directory on HDFS: %s",
                         tmp_output_dir)
            run_bcl_converter(options.input, tmp_output_dir, options.n_nodes,
                              options.jar_path)
        time_after_bcl = time.time()
        run_alignments(tmp_output_dir, options.output)
        time_after_align = time.time()
    finally:
        if options.keep_intermediate:
            logger.info("Leaving intermediate data in directory %s",
                        tmp_output_dir)
        elif not options.skip_bcl:  # if we skipped bcl, tmp_output_dir is the input directory
            try:
                phdfs.rmr(tmp_output_dir)
            except StandardError as e:
                logger.error(
                    "Error while trying to remove temporary output directory %s",
                    tmp_output_dir)
                logger.exception(e)

    finish_time = time.time()
    logger.info("Seconds for bcl conversion:  %0.2f",
                (time_after_bcl - start_time))
    logger.info("Seconds for alignment:  %0.2f",
                (time_after_align - time_after_bcl))
    logger.info("Total execution time:  %0.2f", (finish_time - start_time))
Example #44
    def setUp(self):
        import gzip
        import shutil
        try:
            os.utime(landing_zone + '/badfile.txt', None)
        except OSError:
            f = open(landing_zone + '/badfile.txt', 'a')
            for x in range(0, 10):
                f.write('line ' + str(x) + '\n')
            f.close()
            with open(landing_zone + '/badfile.txt', 'rb') as f_in, \
                    gzip.open(landing_zone + '/badfile.txt.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        try:
            os.utime(landing_zone + '/badfile.txt', None)
        except OSError:
            open(landing_zone + '/badfile.txt', 'a').close()

        try:
            os.utime(landing_zone + '/sample.txt', None)
        except OSError:
            open(landing_zone + '/sample.txt', 'a').close()

        try:
            os.utime(landing_zone + '/sandbox.txt', None)
        except OSError:
            open(landing_zone + '/sandbox.txt', 'a').close()

        try:
            hdfs.rmr(hdfs.path.expanduser("~") + '/data/none/test')
        except IOError:
            pass

        try:
            if hdfs.path.exists(
                    hdfs.path.expanduser("~") + '/data/duplicate/'):
                hdfs.rmr(hdfs.path.expanduser("~") + '/data/duplicate/')
        except IOError:
            pass

        try:
            if hdfs.path.exists(hdfs.path.expanduser("~") + '/data/spam/'):
                hdfs.rmr(hdfs.path.expanduser("~") + '/data/spam/')
        except IOError:
            pass
        try:
            if hdfs.path.exists(
                    hdfs.path.expanduser("~") +
                    '/data/sandbox/bria644/sandboxFile/sandbox.txt'):
                hdfs.rmr(
                    hdfs.path.expanduser("~") +
                    '/data/sandbox/bria644/sandboxFile/sandbox.txt')
        except IOError:
            pass
Example #45
 def stat(self):
     if hdfs.default_is_local():
         return
     bn = '%s%s' % (make_random_str(), UNI_CHR)
     fn = '/user/%s/%s' % (DEFAULT_USER, bn)
     fs = hdfs.hdfs("default", 0)
     p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
     with fs.open_file(fn, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(fn)
     fs.close()
     s = hdfs.path.stat(p)
     for n1, n2 in self.NMAP.iteritems():
         attr = getattr(s, n1, None)
         self.assertFalse(attr is None)
         self.assertEqual(attr, info[n2])
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(p)
Example #46
 def run(self):
     exit_code = 1
     with tempfile.NamedTemporaryFile() as f:
         self.log.debug("opened scratch MR job input file %s", f.name)
         # We write the files to be compressed to a temporary file.  Later we'll re-read
         # this temporary file to rename the files as well.  I've opted not to keep the
         # table in memory in the hope of scaling better to jobs with a large number of
         # files (we reduce memory requirements).
         num_files = self.__write_mr_input(f)
         f.flush()
         self.log.debug("Finished writing temp input file")
         input_filename = tempfile.mktemp(dir=os.path.dirname(self.output_path), prefix="dist_txt_zipper_input")
         tmpfile_uri = "file://%s" % f.name
         try:
             self.log.debug("copying input from %s to %s", tmpfile_uri, input_filename)
             hdfs.cp(tmpfile_uri, input_filename)
             self.log.info("Run analyzed.  Launching distributed job")
             # launch mr task
             pydoop_args = \
                 [ 'script', '--num-reducers', '0','--kv-separator', '',
                   '-Dmapred.map.tasks=%d' % num_files,
                   '-Dmapred.input.format.class=org.apache.hadoop.mapred.lib.NLineInputFormat',
                   '-Dmapred.line.input.format.linespermap=1',
                   '-Dmapred.output.compress=true',
                   '-Dmapred.output.compression.codec=%s' % 'org.apache.hadoop.io.compress.GzipCodec',
                   text_zipper_mr.__file__,
                   input_filename,
                   self.output_path]
             self.log.debug("pydoop_args: %s", pydoop_args)
             self.log.info("Compressing %s files", num_files)
             pydoop_app.main(pydoop_args)
             self.log.info("Distributed job complete")
             self.rename_compressed_files(f)
             self.log.info("finished")
             exit_code = 0
         finally:
             try:
                 self.log.debug("Removing temporary input file %s", input_filename)
                 hdfs.rmr(input_filename)
             except IOError as e:
                 self.log.warning("Problem cleaning up.  Error deleting temporary input file %s", input_filename)
                 self.log.exception(str(e))
         return exit_code
Example #47
 def stat(self):
     if hdfs.default_is_local():
         return
     bn = '%s%s' % (make_random_str(), UNI_CHR)
     fn = '/user/%s/%s' % (DEFAULT_USER, bn)
     fs = hdfs.hdfs("default", 0)
     p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
     with fs.open_file(fn, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(fn)
     fs.close()
     s = hdfs.path.stat(p)
     for n1, n2 in self.NMAP.iteritems():
         attr = getattr(s, n1, None)
         self.assertFalse(attr is None)
         self.assertEqual(attr, info[n2])
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(p)
Example #48
  def execute(self, logger, env=None):
    """
    Executes the command.

    This method calls self.command to build the command array and then executes
    the command.  If provided, the specified `env` will be used.
    """
    cmd = self.command(env)
    logger.debug("attempting to remove output path %s", self.output_str)
    try:
      phdfs.rmr(self.output_str)
    except IOError as e:
      logger.warning(e)

    if not phdfs.path.exists(phdfs.path.dirname(self.output_str)):
      phdfs.mkdir(phdfs.path.dirname(self.output_str))
      logger.debug("Created parent of output directory")

    logger.info("Executing command: %s", cmd)
    logger.debug("PATH: %s", (env or os.environ).get('PATH'))
    subprocess.check_call(cmd, env=env)
Example #49
    def clean_up(self):
        if sys.argv.count('--no-cleanup') > 0:
            self.logger.warn("User specified --no-cleanup.  Not deleting temporary files")
            self.logger.warn("output dir: %s", self.output_dir)
            self.logger.warn("hdfs input path: %s", self.make_hdfs_input_path())
            self.logger.warn("hdfs output path: %s", self.make_hdfs_output_path())
            self.logger.warn("hdfs test path: %s", self.make_hdfs_test_path())
            return

        self.rm_output_dir()
        try:
            hdfs.rmr(self.make_hdfs_input_path())
        except Exception as e:
            self.logger.warning(e)
            pass

        try:
            hdfs.rmr(self.make_hdfs_output_path())
        except Exception as e:
            self.logger.warning(e)
            pass

        try:
            hdfs.rmr(self.make_hdfs_test_path())
        except Exception as e:
            self.logger.warning(e)
            pass
Example #50
def runSparkClassifyHijackingUniquePrefixDuration(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass


    k = sc.textFile(readPath)\
            .map(lambda v: parseVerifyLineUniquePrefix(v))\
            .filter(lambda v: v is not None)\
            .filter(lambda v: notDataError(dataset, v))\
            .filter(lambda v: isIPv4v6(v, ip_type))\
            .filter(lambda v: classifyBGPAdvSparse(v) == "rpki-invalid")\
            .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
            .filter(lambda v: onlyHijackAttempt(v))\
            .map(lambda v: ( (classifyHijack(v), v['prefix_addr'], v['prefix_len'], v['origin_as']), v['time']))\
            .groupByKey()\
            .map(lambda ((classifyHijack, prefix_addr, prefix_len, origin), list_of_time): (classifyHijack, prefix_addr, prefix_len, origin, len(set(list_of_time))))\
            .map(toTSV)\
            .saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass

    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
Example #51
def rmr(hdfs_path, project=None):
    """
    Recursively remove files and directories.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to project_name in HDFS).
        :project: If the supplied hdfs_path is a relative path, it will look for that file in this project's subdir in HDFS.

    """
    if project == None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.rmr(hdfs_path)
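Given the docstring above, a minimal usage sketch for this wrapper might look as follows; the project and path names are illustrative, not taken from the example.

# Relative path: resolved under the current project's directory in HDFS.
rmr("Resources/tmp_output")
# Relative path resolved under another project's directory.
rmr("Resources/tmp_output", project="other_project")
# A full HDFS path is passed through unchanged (illustrative URI).
rmr("hdfs:///Projects/some_project/Resources/tmp_output")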
Example #52
 def __cp_recursive(self, wd):
     src_t = self.__make_tree(wd)
     src = src_t.name
     copy_on_wd = "%s_copy" % src
     src_bn, copy_on_wd_bn = [
         hdfs.path.basename(d) for d in (src, copy_on_wd)
     ]
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
     # check semantics when target dir already exists
     hdfs.rmr(copy_on_wd)
     hdfs.mkdir(copy_on_wd)
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
Example #53
File: test_hdfs.py Project: crs4/pydoop
 def __cp_recursive(self, wd):
     src_t = self.__make_tree(wd)
     src = src_t.name
     copy_on_wd = "%s_copy" % src
     src_bn, copy_on_wd_bn = [
         hdfs.path.basename(d) for d in (src, copy_on_wd)
     ]
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
     # check semantics when target dir already exists
     hdfs.rmr(copy_on_wd)
     hdfs.mkdir(copy_on_wd)
     hdfs.cp(src, copy_on_wd, mode="wb")
     exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
     for t, exp_t in czip(src_t.walk(), exp_t.walk()):
         self.assertTrue(hdfs.path.exists(exp_t.name))
         if t.kind == 0:
             self.assertEqual(hdfs.load(exp_t.name), self.data)
Example #54
def main(args):
    logger.setLevel(logging.DEBUG)

    options = parse_args(args)
    logger.setLevel(options.log_level)

    logger.info("Running workflow with the following configuration")
    logger.info("n_nodes: %d", options.n_nodes)
    logger.info("bcl converter jar %s", options.jar_path)
    logger.info("Other conf:\n%s", GlobalConf)

    start_time = time.time()
    try:
        if options.skip_bcl:
            logger.info("Skipping bcl conversion as requested")
            tmp_output_dir = options.input
        else:
            tmp_output_dir = mk_hdfs_temp_dir('bcl_output_')
            logger.debug("Temporary output directory on HDFS: %s", tmp_output_dir)
            run_bcl_converter(options.input, tmp_output_dir, options.n_nodes, options.jar_path)
        time_after_bcl = time.time()
        run_alignments(tmp_output_dir, options.output)
        time_after_align = time.time()
    finally:
        if options.keep_intermediate:
            logger.info("Leaving intermediate data in directory %s", tmp_output_dir)
        elif not options.skip_bcl: # if we skipped bcl, tmp_output_dir is the input directory
            try:
                phdfs.rmr(tmp_output_dir)
            except StandardError as e:
                logger.error("Error while trying to remove temporary output directory %s", tmp_output_dir)
                logger.exception(e)

    finish_time = time.time()
    logger.info("Seconds for bcl conversion:  %0.2f", (time_after_bcl - start_time))
    logger.info("Seconds for alignment:  %0.2f", (time_after_align - time_after_bcl))
    logger.info("Total execution time:  %0.2f", (finish_time - start_time))
Example #55
    def run(self):
        pydoop_exec = self.find_exec('pydoop')
        if pydoop_exec is None:
            raise RuntimeError("Can't find pydoop executable in PATH")

        with tempfile.NamedTemporaryFile() as f:
            num_records = self.__write_mr_input(f)
            f.flush()
            self.log.debug("Wrote temp input file %s", f.name)
            input_filename = tempfile.mktemp(dir=os.path.dirname(self.output_path), prefix="dist_bcl2qseq_input")
            tmpfile_uri = "file://%s" % f.name
            try:
                self.log.debug("copying input from %s to %s", tmpfile_uri, input_filename)
                hdfs.cp(tmpfile_uri, input_filename)
                self.log.info("Run analyzed.  Launching distributed job")
                # launch mr task
                cmd = [ 'pydoop', 'script', '--num-reducers', '0', '--kv-separator', '',
                        '-Dmapred.map.tasks=%d' % num_records,
                        '-Dmapred.input.format.class=org.apache.hadoop.mapred.lib.NLineInputFormat',
                        '-Dmapred.line.input.format.linespermap=1',
                        bcl2qseq_mr.__file__,
                        input_filename,
                        self.output_path]
                self.log.debug(str(cmd))
                subprocess.check_call(cmd)
                self.log.info("Distributed job complete")
            except subprocess.CalledProcessError as e:
                self.log.exception(e)
                self.log.error("Error running pydoop script component")
                raise
            finally:
                try:
                    hdfs.rmr(input_filename)
                except IOError as e:
                    self.log.debug("Problem cleaning up.  Error deleting temporary input file %s", input_filename)
                    self.log.debug(str(e))
Example #56
def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    ## Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    # Do an ls to find all month dirs
    for year_dir in year_dirs:
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir, month_dir))
            if not files:
                LOGGER.debug("Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1

        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete year directory too
            LOGGER.debug("Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
Example #57
File: hadut.py Project: crs4/pydoop
 def clean(self):
     """
     Remove the working directory, if any.
     """
     if self.wd:
         hdfs.rmr(self.wd)
Example #58
 def samefile_rel(self):
     p = make_random_str() + UNI_CHR
     hdfs.dump("foo\n", p)
     self.assertTrue(hdfs.path.samefile(p, hdfs.path.abspath(p)))
     hdfs.rmr(p)
Example #59
 def tearDown(self):
     hdfs.rmr(self.path)
Example #60
 def good(self):
   path = utils.make_random_str()
   hdfs.dump("foo\n", path)
   self.assertTrue(hdfs.path.exists(path))
   hdfs.rmr(path)
   self.assertFalse(hdfs.path.exists(path))