def download(self):
    tar_name = self.url.rsplit("/", 1)[-1]

    def _report(count, block_size, total_size):
        perc = 100 * count * block_size / total_size
        sys.stdout.write("\r>> Getting %s %.1f%%" % (tar_name, perc))
        sys.stdout.flush()

    tempd = tempfile.mkdtemp(prefix="pydeep_")
    tar_path = os.path.join(tempd, tar_name)
    tar_path, _ = urllib.request.urlretrieve(self.url, tar_path, _report)
    print()
    dest_dir = hdfs.path.dirname(self.path)
    if dest_dir:
        hdfs.mkdir(dest_dir)
    with tarfile.open(tar_path, "r:gz") as tar:
        try:
            info = tar.getmember(self.filename)
        except KeyError:
            raise ValueError("{} not found in {}".format(
                self.filename, tar_name))
        f_in = tar.extractfile(info)
        with hdfs.open(self.path, "wb") as f_out:
            while True:
                chunk = f_in.read(PAGESIZE)
                if not chunk:
                    break
                f_out.write(chunk)
    shutil.rmtree(tempd)
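A minimal self-contained sketch of the same chunked copy-to-HDFS pattern, assuming a plain local source file rather than a downloaded tarball; copy_local_to_hdfs and the 4 KiB chunk size are illustrative, not taken from the snippet above:

import pydoop.hdfs as hdfs

PAGESIZE = 4096  # illustrative chunk size

def copy_local_to_hdfs(local_path, hdfs_path):
    # Create the parent directory on HDFS if needed, then stream in chunks.
    dest_dir = hdfs.path.dirname(hdfs_path)
    if dest_dir:
        hdfs.mkdir(dest_dir)
    with open(local_path, "rb") as f_in, hdfs.open(hdfs_path, "wb") as f_out:
        while True:
            chunk = f_in.read(PAGESIZE)
            if not chunk:
                break
            f_out.write(chunk)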
def run_alignments(bcl_output_dir, output_dir):
    sample_directories = _get_samples_from_bcl_output(bcl_output_dir)
    logger.info("Found %d samples in bcl output directory", len(sample_directories))
    logger.debug("Making base output directory %s", output_dir)
    phdfs.mkdir(output_dir)
    # launch all the jobs
    base_cmd = [
        get_exec('seal'), 'seqal', '--align-only',
        '-D', 'seal.seqal.nthreads={:d}'.format(GlobalConf['seqal_nthreads']),
        '-D', 'mapreduce.map.cpu.vcores={:d}'.format(GlobalConf['seqal_yarn_cores']),
        '--input-format', GlobalConf.get('seqal_input_fmt', 'prq'),
        '--output-format', GlobalConf.get('seqal_output_fmt', 'sam'),
        '--ref-archive', GlobalConf['reference_archive'],
    ]

    def start_job(sample_dir):
        sample_output_dir = phdfs.path.join(output_dir, os.path.basename(sample_dir))
        cmd = base_cmd + [sample_dir, sample_output_dir]
        # LP: should refactor to start the job within the AlignJob object
        job = AlignJob(cmd=cmd, inputp=sample_dir, outputp=sample_output_dir)
        logger.info("Launching alignment of sample %s", os.path.basename(sample_dir))
        logger.debug("executing command: %s", cmd)
        job.popen_obj = subprocess.Popen(map(str, cmd), bufsize=4096)
        job.popen_obj.poll()
        logger.debug("job running with PID %d", job.popen_obj.pid)
        return job

    jobs = [start_job(s) for s in sample_directories]
    ok = _wait(jobs, GlobalConf['remove_output'])
    if not ok:
        errored_jobs = [j for j in jobs if j.failed]
        logger.error("%d alignment jobs failed", len(errored_jobs))
        logger.error("Here are the return codes: %s",
                     ', '.join(str(j.retcode) for j in errored_jobs))
        raise RuntimeError("Some alignment jobs failed")
def _write(self, data):
    "Internal Write API"
    schema = self.schema
    wmaid = self.wmaid(data)
    year, month, _ = today()
    hdir = '%s/%s/%s' % (self.hdir, year, month)
    if not hdfs.path.isdir(hdir):
        hdfs.mkdir(hdir)
    fname = file_name(hdir, wmaid, self.compress)

    # create Avro writer and binary encoder
    writer = avro.io.DatumWriter(schema)
    bytes_writer = io.BytesIO()
    if self.compress:
        # use gzip'ed writer with BytesIO file object
        gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
        encoder = avro.io.BinaryEncoder(gzip_writer)
    else:
        # plain binary encoder
        encoder = avro.io.BinaryEncoder(bytes_writer)

    # write records from given data stream to binary writer
    writer.write(data, encoder)

    # close gzip stream if necessary
    if self.compress:
        gzip_writer.flush()
        gzip_writer.close()

    # store raw data to hadoop via HDFS
    hdfs.dump(bytes_writer.getvalue(), fname)

    # close bytes stream
    bytes_writer.close()
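A complementary read path can be sketched under the assumption that a file was produced by the _write method above (same schema and compress flag); read_record is a hypothetical helper, not part of the original class:

import gzip
import io

import avro.io
import pydoop.hdfs as hdfs

def read_record(fname, schema, compress=False):
    # Load the raw bytes back from HDFS and decode a single Avro record.
    raw = hdfs.load(fname)
    stream = io.BytesIO(raw)
    if compress:
        stream = gzip.GzipFile(fileobj=stream, mode='rb')
    decoder = avro.io.BinaryDecoder(stream)
    reader = avro.io.DatumReader(schema)
    return reader.read(decoder)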
def mk_hdfs_temp_dir(prefix):
    found = True
    while found:
        tmp = os.path.basename(tempfile.mktemp(prefix=prefix))
        found = phdfs.path.exists(tmp)
    phdfs.mkdir(tmp)
    return tmp
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    self.logger.debug("remote_wd: %s", self.remote_wd)
    self.logger.debug("remote_exe: %s", self.remote_exe)
    self.logger.debug("remotes: %s", self.files_to_upload)
    if self.args.module:
        self.logger.debug(
            'Generated pipes_code:\n\n %s', self._generate_pipes_code()
        )
    if not self.args.pretend:
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        self.logger.debug("created and chmod-ed: %s", self.remote_wd)
        pipes_code = self._generate_pipes_code()
        hdfs.dump(pipes_code, self.remote_exe)
        self.logger.debug("dumped pipes_code to: %s", self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        for (l, h, _) in self.files_to_upload:
            self.logger.debug("uploading: %s to %s", l, h)
            hdfs.cp(l, h)
    self.logger.debug("Created%sremote paths:" % (
        ' [simulation] ' if self.args.pretend else ' '
    ))
def hdfs_file(odir, name):
    """
    Given HDFS dir and file name create appropriate dir structure on HDFS
    and return full path of the file. We rely on odir/YYYY/MM/DD dir structure.
    """
    # each file is in form YYYYMMDD_HHMM.ext
    tstamp = name.split('/')[-1].split('_')[0]
    if not PAT_YYYYMMDD.match(tstamp):
        raise Exception(
            "Given file name '%s' does not contain YYYYMMDD stamp" % name)
    year = tstamp[:4]
    if not PAT_YYYY.match(year):
        raise Exception("Given file name '%s' does not contain YYYY stamp" % name)
    month = tstamp[4:6]
    if not PAT_MM.match(month):
        raise Exception("Given file name '%s' does not contain MM stamp" % name)
    day = tstamp[6:8]
    if not PAT_DD.match(day):
        raise Exception("Given file name '%s' does not contain DD stamp" % name)
    if not hdfs.path.isdir(odir):
        hdfs.mkdir(odir)
    for subdir in [year, month, day]:
        odir = os.path.join(odir, subdir)
        if not hdfs.path.isdir(odir):
            hdfs.mkdir(odir)
    return os.path.join(odir, name)
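The PAT_* patterns are referenced but not shown in the snippet; a plausible set of definitions, assuming they simply validate the digit groups named in the error messages (these exact regexes are an assumption):

import re

# Hypothetical definitions: the function above only references these names.
PAT_YYYYMMDD = re.compile(r'^[0-9]{8}$')
PAT_YYYY = re.compile(r'^[0-9]{4}$')
PAT_MM = re.compile(r'^[0-9]{2}$')
PAT_DD = re.compile(r'^[0-9]{2}$')

# Example: hdfs_file('/data', '20230401_1200.ext') would create
# /data/2023, /data/2023/04 and /data/2023/04/01 as needed and return
# '/data/2023/04/01/20230401_1200.ext'.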
def mkdir(self):
    for wd in self.local_wd, self.hdfs_wd:
        d1 = "%s/d1" % wd
        d2 = "%s/d2" % d1
        hdfs.mkdir(d2)
        dir_list = hdfs.ls(d1)
        self.assertEqual(len(dir_list), 1)
        self.assertTrue(dir_list[0].endswith(d2))
def __init__(self, prefix=None, logger=None):
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def mk_hdfs_temp_dir(prefix):
    if not pydoop_here:
        raise NotImplementedError("Pydoop not available on this system")
    found = True
    while found:
        tmp = os.path.basename(tempfile.mktemp(prefix=prefix))
        found = phdfs.path.exists(tmp)
    phdfs.mkdir(tmp)
    return tmp
def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)
def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)
def __make_tree(self, wd):
    d1 = "%s/d1" % wd
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        hdfs.dump(self.data, f)
        t.add(f, 0)
    return t1
def __init__(self, prefix=None, logger=None):
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn)
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd)
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir)
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def test_isdir(self):
    for path in self.path, self.u_path:
        self.assertFalse(hdfs.path.isdir(path))
        try:
            hdfs.dump("foo\n", path)
            self.assertFalse(hdfs.path.isdir(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertTrue(hdfs.path.isdir(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def test_isdir(self):
    path = utils.make_random_str()
    self.assertFalse(hdfs.path.isdir(path))
    try:
        hdfs.dump("foo\n", path)
        self.assertFalse(hdfs.path.isdir(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertTrue(hdfs.path.isdir(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    path = utils.make_random_str()
    self.assertTrue(hdfs.path.kind(path) is None)
    try:
        hdfs.dump("foo\n", path)
        self.assertEqual('file', hdfs.path.kind(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertEqual('directory', hdfs.path.kind(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    for path in self.path, self.u_path:
        self.assertTrue(hdfs.path.kind(path) is None)
        try:
            hdfs.dump("foo\n", path)
            self.assertEqual('file', hdfs.path.kind(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertEqual('directory', hdfs.path.kind(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def __init__(self, file_prefix, loadexist=False, readonly=False):
    CustomStorage.__init__(self)
    if not loadexist:
        if hdfs.path.exists('{0}_0'.format(file_prefix)):
            file_prefix += '_0'
        while hdfs.path.exists('{0}_0'.format(file_prefix)):
            insert_index = file_prefix.rfind('_')
            file_prefix = '{0}_{1}'.format(
                file_prefix[:insert_index],
                int(file_prefix[insert_index + 1:]) + 1)
    self.file_prefix = file_prefix
    self.read_only = readonly
    self.clear()
    logger.info('init hdfs storage from hdfs file_prefix {0}'.format(self.file_prefix))
    try:
        total_start = timeit.default_timer()
        prefix_split = hdfs.path.splitpath(self.file_prefix)
        folder_path = prefix_split[0]
        real_prefix = prefix_split[1] + '_'
        if not hdfs.path.exists(folder_path):
            hdfs.mkdir(folder_path)
        files_info = hdfs.lsl(folder_path)
        # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
        logger.debug('files_info:{0}'.format(files_info))
        sizecount = 0
        for file_info in files_info:
            start_time = timeit.default_timer()
            file_name = hdfs.path.splitpath(file_info['path'])[1]
            if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                logger.debug('file info: {0}'.format(file_info))
                page_id = file_name[len(real_prefix):]
                if not page_id.isdigit():
                    continue
                logger.debug('file {0} page id :{1}#'.format(file_info['path'], page_id))
                # if page_id.isdigit():
                logger.info('load {0}# page file {1}'.format(page_id, file_info['path']))
                content = hdfs.load(file_info['path'], mode='r')
                # logger.debug('{0}# page content:{1}'.format(page_id, content))
                self.pagedict[int(page_id)] = content
                logger.debug('{0}# page load complete'.format(page_id))
                end_time = timeit.default_timer()
                eval(generate_timer_log_str.format(
                    'load {0} {1} byte'.format(
                        file_name, len(self.pagedict[int(page_id)])),
                    start_time, end_time))
                sizecount += len(self.pagedict[int(page_id)])
    except IOError as ie:
        logger.debug(traceback.format_exc())
def calc_bottlenecks(model, img_map, out_dir):
    projector = tflow.BottleneckProjector(model)
    for in_subd, img_paths in img_map.items():
        cls = hdfs.path.basename(in_subd)
        out_subd = hdfs.path.join(out_dir, cls)
        hdfs.mkdir(out_subd)
        bnecks_path = hdfs.path.join(out_subd, BNECKS_BASENAME)
        LOGGER.info("computing bottlenecks for: %s", cls)
        with hdfs.open(bnecks_path, "wb") as out_f:
            for path in img_paths:
                with hdfs.open(path, "rb") as in_f:
                    data = in_f.read()
                checksum = md5(data).digest()
                bneck = projector.project(data)
                out_f.write(checksum + bneck.tobytes())
    projector.close_session()
def __make_tree(self, wd, root="d1", create=True):
    """
    d1
    |-- d2
    |   `-- f2
    `-- f1
    """
    d1 = "%s/%s" % (wd, root)
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    if create:
        hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        if create:
            hdfs.dump(self.data, f, mode="wb")
        t.add(f, 0)
    return t1
def capture(outpath, max_count='3'):
    """
    fab cam.capture:/tmp/cam1,3
    """
    max_count = int(max_count)
    import os
    import cv2
    import copy
    import pydoop.hdfs as hdfs
    cv2.namedWindow('Window1')
    vc = cv2.VideoCapture()
    vc.open(0)
    skip = 50
    max_count *= skip
    basename = os.path.basename(outpath)
    count = 1
    hdfs.mkdir('hdfs://gnn-f02-01' + outpath)
    while True:
        retval, image = vc.read()
        try:
            if count % skip == 0:
                tmpImage = copy.copy(image)
                filename = '%05d.jpg' % (count / skip)
                hdfspath = 'hdfs://gnn-f02-01%(outpath)s/%(filename)s' % locals()
                cv2.putText(tmpImage, filename, (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 2, 2)
                cv2.imshow('Window1', tmpImage)
                cv2.waitKey(1)
                cv2.imwrite(basename + '_' + filename, image)
                hdfs.put(basename + '_' + filename, hdfspath)
                print basename + '_' + filename, hdfspath
            else:
                cv2.imshow('Window1', image)
                cv2.waitKey(1)
        except KeyboardInterrupt:
            break
        count += 1
        if 0 < max_count < count:
            break
    vc.release()
    cv2.destroyWindow('Window1')
def setup(self):
    """
    * Creates an hdfs directory with the name of this test
      (self.make_hdfs_test_path())
    * uploads the local 'input' directory into the hdfs directory
    """
    self.logger.debug("Test setup")
    #hadut.run_hadoop_cmd_e("dfsadmin", args_list=["-safemode", "wait"])
    #self.logger.debug("hdfs out of safe mode")
    if hdfs.path.exists(self.make_hdfs_test_path()):
        error_msg = "hdfs test path '%s' already exists. Please remove it" % \
            self.make_hdfs_test_path()
        self.logger.fatal(error_msg)
        raise RuntimeError(error_msg)
    hdfs.mkdir(self.make_hdfs_test_path())
    local_input = self.make_local_input_path()
    hdfs_input = self.make_hdfs_input_path()
    hdfs.put(local_input, hdfs_input)
    self.logger.info("Copied local input %s to %s", local_input, hdfs_input)
    self.logger.debug("Setup complete")
def execute(self, logger, env=None):
    """
    Executes the command.

    This method calls self.command to build the command array and then
    executes the command. If provided, the specified `env` will be used.
    """
    cmd = self.command(env)
    logger.debug("attempting to remove output path %s", self.output_str)
    try:
        phdfs.rmr(self.output_str)
    except IOError as e:
        logger.warning(e)
    if not phdfs.path.exists(phdfs.path.dirname(self.output_str)):
        phdfs.mkdir(phdfs.path.dirname(self.output_str))
        logger.debug("Created parent of output directory")
    logger.info("Executing command: %s", cmd)
    logger.debug("PATH: %s", (env or os.environ).get('PATH'))
    subprocess.check_call(cmd, env=env)
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    pipes_code = self.__generate_pipes_code()
    hdfs.mkdir(self.remote_wd)
    hdfs.chmod(self.remote_wd, "a+rx")
    hdfs.dump(pipes_code, self.remote_exe)
    hdfs.chmod(self.remote_exe, "a+rx")
    hdfs.put(self.args.module, self.remote_module)
    hdfs.chmod(self.remote_module, "a+r")
    self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
    self.logger.debug("Created remote paths:")
    self.logger.debug(self.remote_wd)
    self.logger.debug(self.remote_exe)
    self.logger.debug(self.remote_module)
def mkdir(hdfs_path, project=None):
    """
    Create a directory and its parents as needed.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative
            one (relative to project_name in HDFS).
        :project: If the supplied hdfs_path is a relative path, it will look
            for that file in this project's subdir in HDFS.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project, exists=False)
    return hdfs.mkdir(hdfs_path)
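A hypothetical usage sketch, assuming a project named "demo" and the relative-path resolution described in the docstring (the exact expansion depends on _expand_path, which is not shown):

# Relative path: resolved against the "demo" project's directory in HDFS.
mkdir("Resources/models", project="demo")
# Parent directories are created as needed, like a recursive mkdir.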
def run_locally(model, input_dirs, output_dir, collate=False):
    hdfs.mkdir(output_dir)
    if collate:
        all_w, all_b = {}, {}
    for d in input_dirs:
        bn = hdfs.path.basename(d)
        weights, biases = get_all_wb(model, d)
        if collate:
            all_w.update({"%s_%s" % (d, t): w for (t, w) in weights.items()})
            all_b.update({"%s_%s" % (d, t): b for (t, b) in biases.items()})
        else:
            w_path = hdfs.path.join(output_dir, "%s_weights.npz" % bn)
            b_path = hdfs.path.join(output_dir, "%s_biases.npz" % bn)
            with hdfs.open(w_path, "wb") as f:
                np.savez(f, **weights)
            with hdfs.open(b_path, "wb") as f:
                np.savez(f, **biases)
    if collate:
        with hdfs.open(hdfs.path.join(output_dir, "weights.npz"), "wb") as f:
            np.savez(f, **all_w)
        with hdfs.open(hdfs.path.join(output_dir, "biases.npz"), "wb") as f:
            np.savez(f, **all_b)
def main():
    # these are hdfs directories
    src_dir = str(sys.argv[1])
    dst_dir = str(sys.argv[2])
    # create dst_dir if it does not exist
    if not pyhdfs.path.exists(dst_dir):
        pyhdfs.mkdir(dst_dir)
    # create sparkcontext
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    # create children path rdd
    children_paths = pyhdfs.ls(src_dir)
    children_paths_rdd = sc.parallelize(children_paths, len(children_paths))
    # each executor task copies one child path
    children_paths_rdd.foreach(lambda file_path: copy_file(
        file_path, os.path.join(dst_dir, os.path.basename(file_path))))
    # stop sparkcontext
    sc.stop()
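The copy_file helper is referenced but not shown; a plausible single-file implementation, sketched under the assumption that pydoop is importable on the Spark executors (the 1 MiB chunk size is illustrative):

import pydoop.hdfs as pyhdfs

def copy_file(src_path, dst_path):
    # Hypothetical helper: stream one HDFS file to its destination path.
    with pyhdfs.open(src_path, "rb") as f_in, \
            pyhdfs.open(dst_path, "wb") as f_out:
        while True:
            chunk = f_in.read(1 << 20)  # 1 MiB chunks (illustrative)
            if not chunk:
                break
            f_out.write(chunk)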
def run_task(factory, port=None, istream=None, ostream=None,
             private_encoding=True, context_class=TaskContext,
             cmd_file=None, fast_combiner=False, auto_serialize=True):
    """
    Run the assigned task in the framework.

    :rtype: bool
    :return: :obj:`True` if the task succeeded.
    """
    connections = resolve_connections(
        port, istream=istream, ostream=ostream, cmd_file=cmd_file,
        auto_serialize=auto_serialize
    )
    context = context_class(connections.up_link,
                            private_encoding=private_encoding,
                            fast_combiner=fast_combiner)
    stream_runner = StreamRunner(factory, context, connections.cmd_stream)
    pstats_dir = os.getenv(PSTATS_DIR)
    if pstats_dir:
        pstats_fmt = os.getenv(PSTATS_FMT, DEFAULT_PSTATS_FMT)
        hdfs.mkdir(pstats_dir)
        fd, pstats_fn = tempfile.mkstemp(suffix=".pstats")
        os.close(fd)
        cProfile.runctx(
            "stream_runner.run()", {"stream_runner": stream_runner},
            globals(), filename=pstats_fn
        )
        name = pstats_fmt % (
            "r" if context.is_reducer() else "m",
            context.get_task_partition(),
            os.path.basename(pstats_fn)
        )
        hdfs.put(pstats_fn, hdfs.path.join(pstats_dir, name))
    else:
        stream_runner.run()
    context.close()
    connections.close()
    return True
def setUp(self):
    self.path = make_random_str() + UNI_CHR
    hdfs.mkdir(self.path)
try:
    app = sys.argv[1].lower()
    city = sys.argv[2].lower()
except IndexError:
    print "Error in application name (Uber/Lyft) or city name (SF/NYC)."
    sys.exit()
if app not in {"uber", "lyft"} or city not in {"sf", "nyc"}:
    print "Error in application name (Uber/Lyft) or city name (SF/NYC)."
    sys.exit()
# Get paths then delete old results and create new path.
file_path = "hdfs://megatron.ccs.neu.edu/user/jiangshan/ridesharing/raw/" + \
    app + "_" + city + "_raw_response"
raw_measurement_path = "hdfs://megatron.ccs.neu.edu/user/jiangshan/ridesharing/proc/" + \
    app + "_" + city + "_raw_measurement"
user_info_path = "resources/" + city + "_user_info.txt"
try:
    hdfs.rmr(raw_measurement_path)
except:
    hdfs.mkdir(raw_measurement_path)
# Get user info dictionary.
user_info = get_user_info(user_info_path=user_info_path)
# Start spark SQL session.
spark = SparkSession.builder.appName(
    "raw_response_to_raw_measurement").getOrCreate()
# Traverse each measurement location.
for data_path in hdfs.ls(file_path):
    data_check = re.search("[A-Z][A-Z]-[0-9][0-9]-[0-9][0-9]", data_path)
    if not data_check:
        continue
    user = user_info[data_check.group()]
    # Load response data RDD.
def perform_copy(options):
    with open(options.src_pathset) as f:
        input_pathset = FilePathset.from_file(f)
    # set up workspace
    workspace = options.workspace
    log.info("Workspace set to %s", workspace)
    if not phdfs.path.exists(workspace):
        log.info("Workspace directory %s doesn't exist. Creating it.", workspace)
        phdfs.mkdir(workspace)
    src_paths = [p for p in input_pathset]
    log.debug("Source paths (first 5 or less): %s", src_paths[0:5])
    # dest_path is a unique path under the workspace whose name should be the
    # same as the Galaxy dataset name.
    dest_path = phdfs.path.join(workspace,
                                phdfs.path.basename(options.output_dataset))
    log.info("Destination path: %s", dest_path)
    if phdfs.path.exists(dest_path):
        raise RuntimeError(
            "Destination path %s already exists. Did you provide a valid "
            "Galaxy output dataset argument?" % dest_path)
    # We need to run a separate copy operation for each "leaf" destination
    # directory. E.g.,
    #   /tmp/dirA/file1 /tmp/dirA/file2 -> workspace/dirA/
    #   /tmp/dirB/file1 -> workspace/dirB/
    #
    # As shown in the example, in general we cannot be sure the source paths
    # have unique basenames. We also cannot rename multiple files on-the-fly
    # (to a new name guaranteed to be unique, such as a uuid4). So, to reduce
    # the number of distcp or cp invocations, we group the source paths
    # together by destination directory (in the example, dirA and dirB).
    # expand for wildcards
    src_uris = [u for wild in src_paths for u in expand_paths(urlparse(wild))]
    log.debug("first 5 src_uris: %s", src_uris[0:5])
    destination_uris = [src_to_dest_path(dest_path, u) for u in src_uris]
    log.debug("first 5 destination_uris: %s", destination_uris[0:5])
    copy_groups = _group_by_dest_dir(src_uris, destination_uris)
    if log.isEnabledFor(logging.DEBUG) and len(copy_groups) > 0:
        tpl = next(copy_groups.iteritems())
        log.debug("one copy group:\n\tdest: %s\n\tsrc: %s", tpl[0], tpl[1])
    try:
        if options.distcp:
            perform_distcp(copy_groups)
        else:
            perform_simple_cp(copy_groups)
    except Exception as e:
        log.critical("Failed to copy data to %s", dest_path)
        log.exception(e)
        log.info("Cleaning up %s, if it exists", dest_path)
        try:
            phdfs.rmr(dest_path)
        except IOError:
            log.debug("Failed to clean-up destination path %s. "
                      "Maybe it was never created.", dest_path)
        raise e
    output_pathset = FilePathset(*copy_groups.iterkeys())
    output_pathset.set_datatype(input_pathset.datatype)
    output_pathset.comment = "Copied from\n" + '\n'.join(src_paths)
    with open(options.output_dataset, 'w') as f:
        output_pathset.write(f)
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    with Timer() as total_time:
        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)
        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT
        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))
        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())
        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(
            int(x.replace("MB", "")), int(y.replace("MB", ""))
        ))
        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info(" dataset folder created")
            hdfs.mkdir(DATASET_DIR)
        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)
            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)
        update_conf(args)
        results = dict()
        for data_input in dataset:
            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                #print res
                #runner.clean()
            results[data_input] = (t.secs, t.msecs)
    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " * script: {0}".format(piped_code_file)
    print " * mappers: {0}".format(CONF["mapred.map.tasks"])
    print " * reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " * dataset: [{0}]".format(",".join(dataset))
    print " * times (input -> secs):"
    for data_input in dataset:
        print "   - {0} -> {1} secs.".format(data_input, results[data_input][0])
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
import os
import itertools
import swiftclient
import pydoop.hdfs as hdfs

container = 'w251-enron'
prefix = 'clean_v2'
hdfs_prefix = '/enron'

authurl = os.environ['SWIFT_AUTH_URL']
user = os.environ['SWIFT_USER']
key = os.environ['SWIFT_KEY']

conn = swiftclient.client.Connection(authurl=authurl, user=user, key=key)
header, objects = conn.get_container(container, prefix=prefix,
                                     full_listing=True)

hdfs.mkdir(hdfs_prefix)

total = len(objects)
count = 1
for obj in objects:
    name = obj['name']
    print 'Downloading %s (%d of %d)' % (name, count, total)
    header, contents = conn.get_object(container, name)
    filename = name.replace('/', '_')
    hdfs.dump(contents, '%s/%s' % (hdfs_prefix, filename))
    count += 1
def open_spider(self, spider):
    self.output_dir = spider.tmp_dir
    self.output_file = f"fg_{spider.html_format}.jsonlines"
    hdfs.mkdir(f"{self.output_dir}")
    self.f = hdfs.open(f"{self.output_dir}/{self.output_file}", "wt")
def create_remote_dir(remote_dir):
    hdfs.mkdir(remote_dir)
    logging.getLogger(__name__).debug(
        "Creating remote directory {0}".format(remote_dir))
def _poly_mkdir(path, *args, **kwargs):
    if path.startswith('hdfs:'):
        return hdfs.mkdir(path, *args, **kwargs)
    else:
        return os.mkdir(path)
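A quick usage sketch of the scheme-based dispatch above (the paths are illustrative):

_poly_mkdir('hdfs://namenode:8020/tmp/demo')  # routed to pydoop's hdfs.mkdir
_poly_mkdir('/tmp/demo')                      # routed to os.mkdir

Note that the two branches differ in semantics: hdfs.mkdir creates missing parent directories, while os.mkdir does not.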