def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    self.logger.debug("remote_wd: %s", self.remote_wd)
    self.logger.debug("remote_exe: %s", self.remote_exe)
    self.logger.debug("remotes: %s", self.files_to_upload)
    if self.args.module:
        self.logger.debug(
            'Generated pipes_code:\n\n %s', self._generate_pipes_code()
        )
    if not self.args.pretend:
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        self.logger.debug("created and chmod-ed: %s", self.remote_wd)
        pipes_code = self._generate_pipes_code()
        hdfs.dump(pipes_code, self.remote_exe)
        self.logger.debug("dumped pipes_code to: %s", self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        for (l, h, _) in self.files_to_upload:
            self.logger.debug("uploading: %s to %s", l, h)
            hdfs.cp(l, h)
    self.logger.debug("Created%sremote paths:" % (
        ' [simulation] ' if self.args.pretend else ' '
    ))

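The mkdir/chmod/dump/cp staging pattern above can be reproduced on its own; below is a minimal sketch, assuming a reachable HDFS and using only the pydoop.hdfs calls that appear in these snippets (the paths and script content are made up for illustration):

# Minimal sketch of the staging pattern: create a world-readable working
# directory, write an executable into it, and upload a local file.
# All paths here are hypothetical.
import pydoop.hdfs as hdfs

remote_wd = "pydoop_demo_wd"            # hypothetical HDFS working directory
remote_exe = remote_wd + "/exe"         # hypothetical executable path

hdfs.mkdir(remote_wd)
hdfs.chmod(remote_wd, "a+rx")           # make it accessible by all
hdfs.dump(b"#!/bin/bash\necho hello\n", remote_exe, mode="wb")
hdfs.chmod(remote_exe, "a+rx")
hdfs.cp("file:///etc/hostname", remote_wd + "/hostname")  # local -> HDFS
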
def copy_file(src_path, dst_path):
    """
    Copy a single path, skipping the copy if the destination already exists.

    :param src_path: source HDFS path
    :param dst_path: destination HDFS path
    """
    if not pyhdfs.path.exists(dst_path):
        pyhdfs.cp(src_path, dst_path)

def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)

def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)

def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn)
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd)
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir)
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)

def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)

def cp(src_hdfs_path, dest_hdfs_path):
    """
    Copy the contents of src_hdfs_path to dest_hdfs_path.

    If src_hdfs_path is a directory, its contents will be copied
    recursively.  Source file(s) are opened for reading and copies are
    opened for writing.

    Args:
        :src_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :dest_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
    """
    src_hdfs_path = _expand_path(src_hdfs_path)
    dest_hdfs_path = _expand_path(dest_hdfs_path)

    hdfs.cp(src_hdfs_path, dest_hdfs_path)

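A hedged usage sketch of this wrapper follows; the paths are hypothetical, and the exact resolution of relative paths depends on how _expand_path maps them onto the project's HDFS directory:

# Hypothetical calls to the cp() wrapper above.  Relative paths are resolved
# against the project's HDFS path; fully-qualified paths pass through as-is.
cp("Resources/model.json", "Resources/model_backup.json")
cp("hdfs:///Projects/demo/Resources/data.csv", "Resources/data_copy.csv")
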
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(src_bn, copy_on_wd_bn)
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(
            src_bn, "%s/%s" % (copy_on_wd_bn, src_bn)
        )
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)

def run(self):
    exit_code = 1
    with tempfile.NamedTemporaryFile() as f:
        self.log.debug("opened scratch MR job input file %s", f.name)
        # We write the files to be compressed to a temporary file.  Later
        # we'll re-read this temporary file to rename the files as well.
        # I've opted not to keep the table in memory in the hope of scaling
        # better to jobs with a large number of files (we reduce memory
        # requirements).
        num_files = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Finished writing temp input file")
        input_filename = tempfile.mktemp(
            dir=os.path.dirname(self.output_path),
            prefix="dist_txt_zipper_input"
        )
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s",
                           tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            pydoop_args = [
                'script', '--num-reducers', '0', '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_files,
                '-Dmapred.input.format.class='
                'org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                '-Dmapred.output.compress=true',
                '-Dmapred.output.compression.codec=%s' %
                'org.apache.hadoop.io.compress.GzipCodec',
                text_zipper_mr.__file__,
                input_filename,
                self.output_path
            ]
            self.log.debug("pydoop_args: %s", pydoop_args)
            self.log.info("Compressing %s files", num_files)
            pydoop_app.main(pydoop_args)
            self.log.info("Distributed job complete")
            self.rename_compressed_files(f)
            self.log.info("finished")
            exit_code = 0
        finally:
            try:
                self.log.debug("Removing temporary input file %s",
                               input_filename)
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.warning("Problem cleaning up. Error deleting "
                                 "temporary input file %s", input_filename)
                self.log.exception(str(e))
    return exit_code

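The local-to-HDFS staging step used above (an hdfs.cp with a "file://" source URI pointing at a scratch file) also works in isolation; a minimal sketch with made-up paths:

# Minimal sketch: stage a local temp file on HDFS before launching a job.
# The destination path is hypothetical; hdfs.cp() accepts a "file://" URI
# as the source, as in the snippet above.
import tempfile
import pydoop.hdfs as hdfs

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write("one line of MR input per map task\n")
    f.flush()
    hdfs.cp("file://%s" % f.name, "job_input.txt")  # local -> HDFS
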
def cp(src_hdfs_path, dest_hdfs_path, overwrite=False):
    """
    Copy the contents of src_hdfs_path to dest_hdfs_path.

    If src_hdfs_path is a directory, its contents will be copied
    recursively.  Source file(s) are opened for reading and copies are
    opened for writing.

    Args:
        :src_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :dest_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :overwrite: boolean flag whether to overwrite destination path or not.
    """
    src_hdfs_path = _expand_path(src_hdfs_path)
    dest_hdfs_path = _expand_path(dest_hdfs_path, exists=False)

    if overwrite and exists(dest_hdfs_path):
        # delete path since overwrite flag was set to true
        delete(dest_hdfs_path, recursive=True)

    hdfs.cp(src_hdfs_path, dest_hdfs_path)

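The overwrite behaviour boils down to a check-then-delete before the copy. Below is a minimal sketch of the same idea using pydoop's module-level helpers directly; cp_overwrite is a hypothetical helper, and hdfs.path.exists/hdfs.rmr stand in for the project-aware exists/delete wrappers used above:

import pydoop.hdfs as hdfs

def cp_overwrite(src, dest):
    # Remove the destination first so hdfs.cp() does not raise IOError
    # when the target already exists (see the tests earlier in this page).
    if hdfs.path.exists(dest):
        hdfs.rmr(dest)
    hdfs.cp(src, dest)
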
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)

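The two cases exercised by these recursive-copy tests can be summarised in a few lines: copying a directory to a non-existing target creates the target as a copy of the source, while copying onto an existing directory nests the source inside it. A minimal sketch, with hypothetical directory names:

import pydoop.hdfs as hdfs

hdfs.mkdir("scratch/src")              # hypothetical source directory
hdfs.cp("scratch/src", "scratch/dst")  # dst did not exist: becomes a copy of src
hdfs.cp("scratch/src", "scratch/dst")  # dst exists: copy lands at scratch/dst/src
assert hdfs.path.exists("scratch/dst/src")
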
def run(self):
    pydoop_exec = self.find_exec('pydoop')
    if pydoop_exec is None:
        raise RuntimeError("Can't find pydoop executable in PATH")

    with tempfile.NamedTemporaryFile() as f:
        num_records = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Wrote temp input file %s", f.name)
        input_filename = tempfile.mktemp(
            dir=os.path.dirname(self.output_path),
            prefix="dist_bcl2qseq_input"
        )
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s",
                           tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            cmd = [
                'pydoop', 'script', '--num-reducers', '0',
                '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_records,
                '-Dmapred.input.format.class='
                'org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                bcl2qseq_mr.__file__,
                input_filename,
                self.output_path
            ]
            self.log.debug(str(cmd))
            subprocess.check_call(cmd)
            self.log.info("Distributed job complete")
        except subprocess.CalledProcessError as e:
            self.log.exception(e)
            self.log.error("Error running pydoop script component")
            raise
        finally:
            try:
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.debug("Problem cleaning up. Error deleting "
                               "temporary input file %s", input_filename)
                self.log.debug(str(e))

def thread_allow(self):
    # test whether our code is properly allowing other python threads to
    # make progress while we're busy doing I/O

    class BusyCounter(Thread):

        def __init__(self):
            super(BusyCounter, self).__init__()
            self.done = False
            self._count = 0

        @property
        def count(self):
            return self._count

        def run(self):
            while not self.done:
                self._count += 1

    class BusyContext(object):

        def __init__(self):
            self.counter = None

        def __enter__(self):
            self.counter = BusyCounter()
            self.counter.start()

        def __exit__(self, _1, _2, _3):
            self.counter.done = True
            self.counter.join()

        @property
        def count(self):
            return self.counter.count

    some_data = "a" * (5 * 1024 * 1024)  # 5 MB
    counter = BusyContext()

    ###########################
    acceptable_threshold = 5
    # The tests were sometimes failing on TravisCI (slower machines) with
    # counts below 100.  A test where we left the GIL locked showed that in
    # that case the counter value doesn't change at all across calls, so in
    # theory even an increment of 1 would demonstrate that the mechanism is
    # working.
    # If the hdfs call doesn't release the GIL, the counter won't make any
    # progress during the HDFS call and will be stuck at 0.  On the other
    # hand, if the GIL is released during the operation we'll see a count
    # value > 0.

    fs = hdfs.hdfs("default", 0)
    with fs.open_file(self.hdfs_paths[0], "w") as f:
        with counter:
            f.write(some_data)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with fs.open_file(self.hdfs_paths[0], "r") as f:
        with counter:
            f.read()
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.list_directory('/')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.cp(self.hdfs_paths[0], self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.rmr(self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

def copy_and_compare(self, metadata, instance_guid, file_name):
    src_db_name = reg.db_name(metadata)
    src_db_table = reg.db_table(metadata, type='work')
    work_db_name = reg.db_name(metadata, stage="valid", type='work')
    work_db_table = reg.db_table(metadata, stage="valid", type='work')
    invalid_reason = {}
    if 'fileUpdateType' in metadata['file']['technical']:
        update_type = metadata['file']['technical']['fileUpdateType']
    else:
        update_type = 'append'
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    select_list = partition = ''
    partition_list = []
    for field in field_order:
        # build the field list for the create statement
        if 'partitionPosition' in field:
            partition_list.append(field)
        else:
            # select_list += field["name"] + ', '
            select_list += field_conversion(field) + ', '
    now = time.time()
    logger.info('Comparison start')
    # dropping work table if it exists
    self._query('drop table if exists ' + work_db_name + '.' + work_db_table)
    self.create_hive_table(metadata, stage="valid", type='work')
    try:
        self.copy_table_data(metadata, instance_guid, copy_type='compare')
    except RuntimeError:
        invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                         "caused hive to fail, check "
                                         "ingestion logs")
    except impala.dbapi.OperationalError:
        invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                         "caused hive to fail, check "
                                         "ingestion logs")
    logger.info('Data loaded to validation table')
    compare = ('SELECT count(*) FROM ' + src_db_name + '.' + src_db_table +
               ' where ')
    for field in metadata['fields']:
        if str(field['datatype']).upper() in {'TIMESTAMP', 'DATE'}:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and ' +
                        field_conversion(field) + ' is null) or ')
        elif str(field['datatype']).upper() in {'FLOAT'}:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and not(' +
                        field_conversion(field) + ' <=> cast(' +
                        field["name"] + ' as float))) or ')
        elif str(field['datatype']).upper() == 'BOOLEAN':
            compare += ('( not(' + field_conversion(field) + ' <=> ' +
                        field["name"] + ')) or ')
        elif str(field['datatype']).upper() == 'BINARY':
            pass
        else:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and not(' +
                        field_conversion(field) + ' <=> ' +
                        field["name"] + ')) or ')
    compare = compare[:-3]
    logger.info('comparison query : ' + compare)
    raw_rows = valid_rows = invalid_rows = 0
    try:
        self._cur.execute(compare)
    except Exception as e:
        logger.info(e.message)
        raise
    logger.info('Valid data check query complete')
    # get the number of invalid rows from the comparison query
    for row in self._cur:
        invalid_rows = row[0]
    if invalid_rows > 0:
        logger.info(str(invalid_rows) + " invalid rows ")
        invalid_reason["datatypeMismatch"] = invalid_rows
    else:
        logger.info("All copied rows are valid")
    raw_rows = self.row_count(src_db_name + '.' + src_db_table)
    logger.info('Raw row count complete')
    valid_rows = self.row_count(
        work_db_name + '.' + work_db_table,
        'instance_guid = "' + str(instance_guid) + '"'
    )
    logger.info('Valid row count complete')
    logger.info("Raw rows = " + str(raw_rows) + ": Valid Rows = " +
                str(valid_rows))
    if raw_rows - valid_rows != 0:
        logger.info("Mismatch count = " + str(raw_rows - valid_rows))
        invalid_reason["rowCountMismatch"] = raw_rows - valid_rows
    logger.info("End copy and compare " + str(time.time()))
    logger.info("finished in " + str(time.time() - now) + " seconds")
    if len(invalid_reason) > 0:
        reg.register_invalid(metadata, instance_guid, file_name,
                             invalid_reason, valid_rows, compare)
    else:
        # Adding append vs full file logic
        if 'fileUpdateType' in metadata['file']['technical']:
            if update_type == 'append':
                # this is the default path so we don't do anything
                logging.info('append file')
            elif update_type == 'full':
                # delete everything in the valid file location
                logger.info('Deleting existing data from valid table')
                if hdfs.path.exists(reg.file_path(metadata, stage="valid")):
                    hdfs.rmr(reg.file_path(metadata, stage="valid"))
            elif update_type == 'delta':
                logging.info('delta file')
                self.delta(metadata)
            else:
                logging.info('update type blank, treating as append file')
        else:
            logging.info(
                'no update type or update type null, treating as append file'
            )
        self.create_hive_table(metadata, stage="valid")
        try:
            self.copy_table_data(metadata, instance_guid, valid_copy=True,
                                 update_type=update_type)
            reg.register_valid(metadata, instance_guid, file_name,
                               valid_rows, compare)
            self._query('drop table if exists ' + work_db_name + '.' +
                        work_db_table)
            if ('fileUpdateType' in metadata['file']['technical'] and
                    metadata['file']['technical']['fileUpdateType'] == 'full'):
                logger.info('Deleting existing data from raw table')
                if hdfs.path.exists(reg.file_path(metadata, stage="raw")):
                    hdfs.rmr(reg.file_path(metadata, stage="raw"))
                hdfs.cp(reg.file_path(metadata, stage="raw", type='work'),
                        reg.file_path(metadata, stage="raw"))
        except (RuntimeError, impala.dbapi.OperationalError):
            invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                             "caused hive to fail, check "
                                             "ingestion logs")
            reg.register_invalid(metadata, instance_guid, file_name,
                                 invalid_reason, valid_rows, compare)

def thread_allow(self):
    # test whether our code is properly allowing other python threads to
    # make progress while we're busy doing I/O

    class BusyCounter(Thread):

        def __init__(self):
            super(BusyCounter, self).__init__()
            self.done = False
            self._count = 0

        @property
        def count(self):
            return self._count

        def run(self):
            while not self.done:
                self._count += 1

    class BusyContext(object):

        def __init__(self):
            self.counter = None

        def __enter__(self):
            self.counter = BusyCounter()
            self.counter.start()

        def __exit__(self, _1, _2, _3):
            self.counter.done = True
            self.counter.join()

        @property
        def count(self):
            return self.counter.count

    some_data = b"a" * (5 * 1024 * 1024)  # 5 MB
    counter = BusyContext()

    ###########################
    acceptable_threshold = 5
    # The tests were sometimes failing on TravisCI (slower machines) with
    # counts below 100.  A test where we left the GIL locked showed that in
    # that case the counter value doesn't change at all across calls, so in
    # theory even an increment of 1 would demonstrate that the mechanism is
    # working.
    # If the hdfs call doesn't release the GIL, the counter won't make any
    # progress during the HDFS call and will be stuck at 0.  On the other
    # hand, if the GIL is released during the operation we'll see a count
    # value > 0.

    fs = hdfs.hdfs("default", 0)
    with fs.open_file(self.hdfs_paths[0], "w") as f:
        with counter:
            f.write(some_data)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with fs.open_file(self.hdfs_paths[0], "r") as f:
        with counter:
            f.read()
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.list_directory('/')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.cp(self.hdfs_paths[0], self.hdfs_paths[0] + '_2', mode="wb")
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.rmr(self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

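The busy-counter trick in the two tests above is reusable for checking that any call releases the GIL. Below is a stripped-down, self-contained sketch of the same pattern around an arbitrary callable (no HDFS involved; count_progress_during is a hypothetical helper introduced here for illustration):

import threading
import time

class BusyCounter(threading.Thread):
    # Increment a counter until asked to stop; any progress while the main
    # thread is inside fn() implies fn() released the GIL.
    def __init__(self):
        super().__init__()
        self.done = False
        self.count = 0

    def run(self):
        while not self.done:
            self.count += 1

def count_progress_during(fn, *args, **kwargs):
    counter = BusyCounter()
    counter.start()
    try:
        fn(*args, **kwargs)
    finally:
        counter.done = True
        counter.join()
    return counter.count

# Example: time.sleep() releases the GIL, so the counter makes progress.
print(count_progress_during(time.sleep, 0.1))
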