def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    self.logger.debug("remote_wd: %s", self.remote_wd)
    self.logger.debug("remote_exe: %s", self.remote_exe)
    self.logger.debug("remotes: %s", self.files_to_upload)
    if self.args.module:
        self.logger.debug(
            'Generated pipes_code:\n\n %s', self._generate_pipes_code()
        )
    if not self.args.pretend:
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        self.logger.debug("created and chmod-ed: %s", self.remote_wd)
        pipes_code = self._generate_pipes_code()
        hdfs.dump(pipes_code, self.remote_exe)
        self.logger.debug("dumped pipes_code to: %s", self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        for (l, h, _) in self.files_to_upload:
            self.logger.debug("uploading: %s to %s", l, h)
            hdfs.cp(l, h)
    self.logger.debug("Created%sremote paths:" % (
        ' [simulation] ' if self.args.pretend else ' '
    ))

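The mkdir/chmod/dump/cp staging pattern above can be reproduced on its own; below is a minimal sketch, assuming a reachable HDFS and using only the pydoop.hdfs calls that appear in these snippets (the paths and script content are made up for illustration):

# Minimal sketch of the staging pattern: create a world-readable working
# directory, write an executable into it, and upload a local file.
# All paths here are hypothetical.
import pydoop.hdfs as hdfs

remote_wd = "pydoop_demo_wd"            # hypothetical HDFS working directory
remote_exe = remote_wd + "/exe"         # hypothetical executable path

hdfs.mkdir(remote_wd)
hdfs.chmod(remote_wd, "a+rx")           # make it accessible by all
hdfs.dump(b"#!/bin/bash\necho hello\n", remote_exe, mode="wb")
hdfs.chmod(remote_exe, "a+rx")
hdfs.cp("file:///etc/hostname", remote_wd + "/hostname")  # local -> HDFS
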
def copy_file(src_path, dst_path):
    """
    Copy a single path, skipping the copy if the destination already exists.

    :param src_path: source HDFS path
    :param dst_path: destination HDFS path
    """
    if not pyhdfs.path.exists(dst_path):
        pyhdfs.cp(src_path, dst_path)

def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd, mode="wb")
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)

def __cp_dir(self, wd):
    src_dir = "%s/src_dir" % wd
    hdfs.mkdir(src_dir)
    copy_on_wd = "%s/src_dir_copy" % wd
    copy_on_copy_on_wd = "%s/src_dir" % copy_on_wd
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_wd))
    hdfs.cp(src_dir, copy_on_wd)
    self.assertTrue(hdfs.path.exists(copy_on_copy_on_wd))
    self.assertRaises(IOError, hdfs.cp, src_dir, copy_on_wd)

def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn)
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd)
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir)
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)

def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)

def cp(src_hdfs_path, dest_hdfs_path):
    """
    Copy the contents of src_hdfs_path to dest_hdfs_path.

    If src_hdfs_path is a directory, its contents will be copied
    recursively.  Source file(s) are opened for reading and copies are
    opened for writing.

    Args:
        :src_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :dest_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
    """
    src_hdfs_path = _expand_path(src_hdfs_path)
    dest_hdfs_path = _expand_path(dest_hdfs_path)

    hdfs.cp(src_hdfs_path, dest_hdfs_path)

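A hedged usage sketch of this wrapper follows; the paths are hypothetical, and the exact resolution of relative paths depends on how _expand_path maps them onto the project's HDFS directory:

# Hypothetical calls to the cp() wrapper above.  Relative paths are resolved
# against the project's HDFS path; fully-qualified paths pass through as-is.
cp("Resources/model.json", "Resources/model_backup.json")
cp("hdfs:///Projects/demo/Resources/data.csv", "Resources/data_copy.csv")
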
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(src_bn, copy_on_wd_bn)
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(
            src_bn, "%s/%s" % (copy_on_wd_bn, src_bn)
        )
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)

def run(self):
    exit_code = 1
    with tempfile.NamedTemporaryFile() as f:
        self.log.debug("opened scratch MR job input file %s", f.name)
        # We write the files to be compressed to a temporary file.  Later
        # we'll re-read this temporary file to rename the files as well.
        # I've opted not to keep the table in memory in the hope of scaling
        # better to jobs with a large number of files (we reduce memory
        # requirements).
        num_files = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Finished writing temp input file")
        input_filename = tempfile.mktemp(
            dir=os.path.dirname(self.output_path),
            prefix="dist_txt_zipper_input"
        )
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s",
                           tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            pydoop_args = [
                'script', '--num-reducers', '0', '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_files,
                '-Dmapred.input.format.class='
                'org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                '-Dmapred.output.compress=true',
                '-Dmapred.output.compression.codec=%s' %
                'org.apache.hadoop.io.compress.GzipCodec',
                text_zipper_mr.__file__,
                input_filename,
                self.output_path
            ]
            self.log.debug("pydoop_args: %s", pydoop_args)
            self.log.info("Compressing %s files", num_files)
            pydoop_app.main(pydoop_args)
            self.log.info("Distributed job complete")
            self.rename_compressed_files(f)
            self.log.info("finished")
            exit_code = 0
        finally:
            try:
                self.log.debug("Removing temporary input file %s",
                               input_filename)
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.warning("Problem cleaning up. Error deleting "
                                 "temporary input file %s", input_filename)
                self.log.exception(str(e))
    return exit_code

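The local-to-HDFS staging step used above (an hdfs.cp with a "file://" source URI pointing at a scratch file) also works in isolation; a minimal sketch with made-up paths:

# Minimal sketch: stage a local temp file on HDFS before launching a job.
# The destination path is hypothetical; hdfs.cp() accepts a "file://" URI
# as the source, as in the snippet above.
import tempfile
import pydoop.hdfs as hdfs

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as f:
    f.write("one line of MR input per map task\n")
    f.flush()
    hdfs.cp("file://%s" % f.name, "job_input.txt")  # local -> HDFS
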
def cp(src_hdfs_path, dest_hdfs_path, overwrite=False):
    """
    Copy the contents of src_hdfs_path to dest_hdfs_path.

    If src_hdfs_path is a directory, its contents will be copied
    recursively.  Source file(s) are opened for reading and copies are
    opened for writing.

    Args:
        :src_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :dest_hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
        :overwrite: boolean flag whether to overwrite destination path or not.
    """
    src_hdfs_path = _expand_path(src_hdfs_path)
    dest_hdfs_path = _expand_path(dest_hdfs_path, exists=False)

    if overwrite and exists(dest_hdfs_path):
        # delete path since overwrite flag was set to true
        delete(dest_hdfs_path, recursive=True)

    hdfs.cp(src_hdfs_path, dest_hdfs_path)

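The overwrite behaviour boils down to a check-then-delete before the copy. Below is a minimal sketch of the same idea using pydoop's module-level helpers directly; cp_overwrite is a hypothetical helper, and hdfs.path.exists/hdfs.rmr stand in for the project-aware exists/delete wrappers used above:

import pydoop.hdfs as hdfs

def cp_overwrite(src, dest):
    # Remove the destination first so hdfs.cp() does not raise IOError
    # when the target already exists (see the tests earlier in this page).
    if hdfs.path.exists(dest):
        hdfs.rmr(dest)
    hdfs.cp(src, dest)
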
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)

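The two cases exercised by these recursive-copy tests can be summarised in a few lines: copying a directory to a non-existing target creates the target as a copy of the source, while copying onto an existing directory nests the source inside it. A minimal sketch, with hypothetical directory names:

import pydoop.hdfs as hdfs

hdfs.mkdir("scratch/src")              # hypothetical source directory
hdfs.cp("scratch/src", "scratch/dst")  # dst did not exist: becomes a copy of src
hdfs.cp("scratch/src", "scratch/dst")  # dst exists: copy lands at scratch/dst/src
assert hdfs.path.exists("scratch/dst/src")
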
def run(self):
    pydoop_exec = self.find_exec('pydoop')
    if pydoop_exec is None:
        raise RuntimeError("Can't find pydoop executable in PATH")

    with tempfile.NamedTemporaryFile() as f:
        num_records = self.__write_mr_input(f)
        f.flush()
        self.log.debug("Wrote temp input file %s", f.name)
        input_filename = tempfile.mktemp(
            dir=os.path.dirname(self.output_path),
            prefix="dist_bcl2qseq_input"
        )
        tmpfile_uri = "file://%s" % f.name
        try:
            self.log.debug("copying input from %s to %s",
                           tmpfile_uri, input_filename)
            hdfs.cp(tmpfile_uri, input_filename)
            self.log.info("Run analyzed. Launching distributed job")
            # launch mr task
            cmd = [
                'pydoop', 'script', '--num-reducers', '0',
                '--kv-separator', '',
                '-Dmapred.map.tasks=%d' % num_records,
                '-Dmapred.input.format.class='
                'org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-Dmapred.line.input.format.linespermap=1',
                bcl2qseq_mr.__file__,
                input_filename,
                self.output_path
            ]
            self.log.debug(str(cmd))
            subprocess.check_call(cmd)
            self.log.info("Distributed job complete")
        except subprocess.CalledProcessError as e:
            self.log.exception(e)
            self.log.error("Error running pydoop script component")
            raise
        finally:
            try:
                hdfs.rmr(input_filename)
            except IOError as e:
                self.log.debug("Problem cleaning up. Error deleting "
                               "temporary input file %s", input_filename)
                self.log.debug(str(e))

def thread_allow(self):
    # test whether our code is properly allowing other python threads to
    # make progress while we're busy doing I/O

    class BusyCounter(Thread):

        def __init__(self):
            super(BusyCounter, self).__init__()
            self.done = False
            self._count = 0

        @property
        def count(self):
            return self._count

        def run(self):
            while not self.done:
                self._count += 1

    class BusyContext(object):

        def __init__(self):
            self.counter = None

        def __enter__(self):
            self.counter = BusyCounter()
            self.counter.start()

        def __exit__(self, _1, _2, _3):
            self.counter.done = True
            self.counter.join()

        @property
        def count(self):
            return self.counter.count

    some_data = "a" * (5 * 1024 * 1024)  # 5 MB
    counter = BusyContext()

    ###########################
    acceptable_threshold = 5
    # The tests were sometimes failing on TravisCI (slower machines) with
    # counts below 100.  A test where we left the GIL locked showed that in
    # that case the counter value doesn't change at all across calls, so in
    # theory even an increment of 1 would demonstrate that the mechanism is
    # working.
    # If the hdfs call doesn't release the GIL, the counter won't make any
    # progress during the HDFS call and will be stuck at 0.  On the other
    # hand, if the GIL is released during the operation we'll see a count
    # value > 0.

    fs = hdfs.hdfs("default", 0)
    with fs.open_file(self.hdfs_paths[0], "w") as f:
        with counter:
            f.write(some_data)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with fs.open_file(self.hdfs_paths[0], "r") as f:
        with counter:
            f.read()
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.list_directory('/')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.cp(self.hdfs_paths[0], self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.rmr(self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

def copy_and_compare(self, metadata, instance_guid, file_name):
    src_db_name = reg.db_name(metadata)
    src_db_table = reg.db_table(metadata, type='work')
    work_db_name = reg.db_name(metadata, stage="valid", type='work')
    work_db_table = reg.db_table(metadata, stage="valid", type='work')
    invalid_reason = {}
    if 'fileUpdateType' in metadata['file']['technical']:
        update_type = metadata['file']['technical']['fileUpdateType']
    else:
        update_type = 'append'
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    select_list = partition = ''
    partition_list = []
    for field in field_order:
        # build the field list for the create statement
        if 'partitionPosition' in field:
            partition_list.append(field)
        else:
            # select_list += field["name"] + ', '
            select_list += field_conversion(field) + ', '
    now = time.time()
    logger.info('Comparison start')
    # dropping work table if it exists
    self._query('drop table if exists ' + work_db_name + '.' + work_db_table)
    self.create_hive_table(metadata, stage="valid", type='work')
    try:
        self.copy_table_data(metadata, instance_guid, copy_type='compare')
    except RuntimeError:
        invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                         "caused hive to fail, check "
                                         "ingestion logs")
    except impala.dbapi.OperationalError:
        invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                         "caused hive to fail, check "
                                         "ingestion logs")
    logger.info('Data loaded to validation table')
    compare = ('SELECT count(*) FROM ' + src_db_name + '.' + src_db_table +
               ' where ')
    for field in metadata['fields']:
        if str(field['datatype']).upper() in {'TIMESTAMP', 'DATE'}:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and ' +
                        field_conversion(field) + ' is null) or ')
        elif str(field['datatype']).upper() in {'FLOAT'}:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and not(' +
                        field_conversion(field) + ' <=> cast(' +
                        field["name"] + ' as float))) or ')
        elif str(field['datatype']).upper() == 'BOOLEAN':
            compare += ('( not(' + field_conversion(field) + ' <=> ' +
                        field["name"] + ')) or ')
        elif str(field['datatype']).upper() == 'BINARY':
            pass
        else:
            compare += ('((' + field["name"] + ' is not null and length(' +
                        field["name"] + ') > 0) and not(' +
                        field_conversion(field) + ' <=> ' +
                        field["name"] + ')) or ')
    compare = compare[:-3]
    logger.info('comparison query : ' + compare)
    raw_rows = valid_rows = invalid_rows = 0
    try:
        self._cur.execute(compare)
    except Exception as e:
        logger.info(e.message)
        raise
    logger.info('Valid data check query complete')
    # get the number of invalid rows from the comparison query
    for row in self._cur:
        invalid_rows = row[0]
    if invalid_rows > 0:
        logger.info(str(invalid_rows) + " invalid rows ")
        invalid_reason["datatypeMismatch"] = invalid_rows
    else:
        logger.info("All copied rows are valid")
    raw_rows = self.row_count(src_db_name + '.' + src_db_table)
    logger.info('Raw row count complete')
    valid_rows = self.row_count(
        work_db_name + '.' + work_db_table,
        'instance_guid = "' + str(instance_guid) + '"'
    )
    logger.info('Valid row count complete')
    logger.info("Raw rows = " + str(raw_rows) + ": Valid Rows = " +
                str(valid_rows))
    if raw_rows - valid_rows != 0:
        logger.info("Mismatch count = " + str(raw_rows - valid_rows))
        invalid_reason["rowCountMismatch"] = raw_rows - valid_rows
    logger.info("End copy and compare " + str(time.time()))
    logger.info("finished in " + str(time.time() - now) + " seconds")
    if len(invalid_reason) > 0:
        reg.register_invalid(metadata, instance_guid, file_name,
                             invalid_reason, valid_rows, compare)
    else:
        # Adding append vs full file logic
        if 'fileUpdateType' in metadata['file']['technical']:
            if update_type == 'append':
                # this is the default path so we don't do anything
                logging.info('append file')
            elif update_type == 'full':
                # delete everything in the valid file location
                logger.info('Deleting existing data from valid table')
                if hdfs.path.exists(reg.file_path(metadata, stage="valid")):
                    hdfs.rmr(reg.file_path(metadata, stage="valid"))
            elif update_type == 'delta':
                logging.info('delta file')
                self.delta(metadata)
            else:
                logging.info('update type blank, treating as append file')
        else:
            logging.info(
                'no update type or update type null, treating as append file'
            )
        self.create_hive_table(metadata, stage="valid")
        try:
            self.copy_table_data(metadata, instance_guid, valid_copy=True,
                                 update_type=update_type)
            reg.register_valid(metadata, instance_guid, file_name,
                               valid_rows, compare)
            self._query('drop table if exists ' + work_db_name + '.' +
                        work_db_table)
            if ('fileUpdateType' in metadata['file']['technical'] and
                    metadata['file']['technical']['fileUpdateType'] == 'full'):
                logger.info('Deleting existing data from raw table')
                if hdfs.path.exists(reg.file_path(metadata, stage="raw")):
                    hdfs.rmr(reg.file_path(metadata, stage="raw"))
                hdfs.cp(reg.file_path(metadata, stage="raw", type='work'),
                        reg.file_path(metadata, stage="raw"))
        except (RuntimeError, impala.dbapi.OperationalError):
            invalid_reason["badMetadata"] = ("Metadata didn't match file and "
                                             "caused hive to fail, check "
                                             "ingestion logs")
            reg.register_invalid(metadata, instance_guid, file_name,
                                 invalid_reason, valid_rows, compare)

def thread_allow(self):
    # test whether our code is properly allowing other python threads to
    # make progress while we're busy doing I/O

    class BusyCounter(Thread):

        def __init__(self):
            super(BusyCounter, self).__init__()
            self.done = False
            self._count = 0

        @property
        def count(self):
            return self._count

        def run(self):
            while not self.done:
                self._count += 1

    class BusyContext(object):

        def __init__(self):
            self.counter = None

        def __enter__(self):
            self.counter = BusyCounter()
            self.counter.start()

        def __exit__(self, _1, _2, _3):
            self.counter.done = True
            self.counter.join()

        @property
        def count(self):
            return self.counter.count

    some_data = b"a" * (5 * 1024 * 1024)  # 5 MB
    counter = BusyContext()

    ###########################
    acceptable_threshold = 5
    # The tests were sometimes failing on TravisCI (slower machines) with
    # counts below 100.  A test where we left the GIL locked showed that in
    # that case the counter value doesn't change at all across calls, so in
    # theory even an increment of 1 would demonstrate that the mechanism is
    # working.
    # If the hdfs call doesn't release the GIL, the counter won't make any
    # progress during the HDFS call and will be stuck at 0.  On the other
    # hand, if the GIL is released during the operation we'll see a count
    # value > 0.

    fs = hdfs.hdfs("default", 0)
    with fs.open_file(self.hdfs_paths[0], "w") as f:
        with counter:
            f.write(some_data)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with fs.open_file(self.hdfs_paths[0], "r") as f:
        with counter:
            f.read()
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        fs.list_directory('/')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.cp(self.hdfs_paths[0], self.hdfs_paths[0] + '_2', mode="wb")
    self.assertGreaterEqual(counter.count, acceptable_threshold)

    with counter:
        hdfs.rmr(self.hdfs_paths[0] + '_2')
    self.assertGreaterEqual(counter.count, acceptable_threshold)

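The busy-counter trick in the two tests above is reusable for checking that any call releases the GIL. Below is a stripped-down, self-contained sketch of the same pattern around an arbitrary callable (no HDFS involved; count_progress_during is a hypothetical helper introduced here for illustration):

import threading
import time

class BusyCounter(threading.Thread):
    # Increment a counter until asked to stop; any progress while the main
    # thread is inside fn() implies fn() released the GIL.
    def __init__(self):
        super().__init__()
        self.done = False
        self.count = 0

    def run(self):
        while not self.done:
            self.count += 1

def count_progress_during(fn, *args, **kwargs):
    counter = BusyCounter()
    counter.start()
    try:
        fn(*args, **kwargs)
    finally:
        counter.done = True
        counter.join()
    return counter.count

# Example: time.sleep() releases the GIL, so the counter makes progress.
print(count_progress_during(time.sleep, 0.1))
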