def write(idir, odir, remove, check, verbose):
    "Write files from given input area into HDFS"
    if not os.path.isdir(idir):
        print("Source area %s does not exist" % idir)
        sys.exit(1)
    if not hdfs.path.isdir(odir):
        print("Destination area on HDFS %s does not exist" % odir)
        print("Create it first with the following command")
        print("hadoop fs -mkdir %s" % odir)
        sys.exit(1)
    for name in os.listdir(idir):
        fname = os.path.join(idir, name)
        if not (name.endswith('.avro') or
                name.endswith('.avro.gz') or
                name.endswith('.avro.bz2')):
            if verbose:
                print("Skip %s" % fname)
            continue
        oname = hdfs_file(odir, name)
        if not hdfs.path.isfile(oname):
            if verbose:
                print("Migrate %s to %s" % (fname, oname))
            hdfs.put(fname, oname)
            if check:
                fsize = os.stat(fname).st_size
                osize = hdfs.stat(oname).st_size
                if fsize != osize:
                    print("Size %s (%s) != %s (%s)" % (fname, fsize, oname, osize))
                    sys.exit(1)
            if remove:
                os.remove(fname)
def commit_file_compressed(srcfile, user_id, extension):
    dstfilename = get_filename(user_id, extension, create=True, hdfs_dest=USE_HDFS)
    with profiled("Uploading of output in %s"):
        # Atomic rename on POSIX
        log.msg("Renaming %s to %s" % (srcfile.name, dstfilename))
        srcfile.close()  # Race condition here?
        if USE_HDFS:
            if hdfs.path.exists(dstfilename):
                if hdfs.path.exists(dstfilename + '.new'):
                    log.msg("Apparently a crashed worker left an unused file behind")
                    hdfs_handle.delete(dstfilename + '.new')
                hdfs.put(srcfile.name, dstfilename + '.new')
                hdfs_handle.delete(dstfilename)
                hdfs_handle.rename(dstfilename + '.new', dstfilename)
            else:
                hdfs.put(srcfile.name, dstfilename)
            os.unlink(srcfile.name)
        else:
            os.rename(srcfile.name, dstfilename)
def put(self):
    src = hdfs.path.split(self.local_paths[0])[-1]
    dest = self.hdfs_paths[0]
    with open(src, "w") as f:
        f.write(self.data)
    hdfs.put(src, dest)
    with hdfs.open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def copy_file_2_remote_dir(remote_dir, log_file):
    LOGGER = logging.getLogger(__name__)
    suffix = time.strftime('%d-%m-%y_%H-%M-%S', time.gmtime(log_file.mtime))
    dest_filename = os.path.join(remote_dir, "{0}-{1}".format(log_file.filename, suffix))
    LOGGER.debug("Copying {0} to {1}".format(log_file.filepath, dest_filename))
    hdfs.put(log_file.filepath, dest_filename)
    LOGGER.debug("Copied {0} to HDFS".format(log_file.filepath))
    hdfs.chmod(dest_filename, BACKUP_PERMISSIONS)
    LOGGER.debug("Changed permissions for {0}".format(dest_filename))
def copy_from_local_to_hdfs(self, src_local_location, dest_local_location):
    if dest_local_location == "":
        print "Not a valid hdfs path"
        return False
    elif os.path.exists(src_local_location):
        hdfs.put(src_local_location, dest_local_location)
        return True
    else:
        print "Local source does not exist"
        return False
def get_mr_options(opt, wd):
    mr_options = BASE_MR_OPTIONS.copy()
    if opt.exclude_fn:
        exclude_bn = os.path.basename(opt.exclude_fn)
        exclude_fn = hdfs.path.abspath(hdfs.path.join(wd, exclude_bn))
        hdfs.put(opt.exclude_fn, exclude_fn)
        mr_options["mapred.cache.files"] = "%s#%s" % (exclude_fn, exclude_bn)
        mr_options["mapred.create.symlink"] = "yes"
        mr_options["ipcount.excludes"] = exclude_bn
    return mr_options
def set_input(self, input_, put=False):
    """
    Set the input path for the job.

    If ``put`` is :obj:`True`, copy (local) ``input_`` to the working
    directory.
    """
    if put and self.wd:
        self.logger.info("copying input data to HDFS")
        hdfs.put(input_, self.input)
    else:
        self.input = input_
        self.logger.info("assigning input to %s", self.input)
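# Hedged usage sketch (not from the original sources): how set_input above is
# typically called, assuming it belongs to a runner such as pydoop's
# hadut.PipesRunner (whose HDFS working directory backs self.wd); the prefix,
# logger and paths are placeholders.
import logging
from pydoop import hadut

logger = logging.getLogger("example")
runner = hadut.PipesRunner(prefix="wc_example", logger=logger)
# put=True: treat "input.txt" as a local file and copy it into the job's
# HDFS working directory before running.
runner.set_input("input.txt", put=True)
# put=False (default): treat the argument as an HDFS path and use it as-is.
# runner.set_input("hdfs:///user/me/input", put=False)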
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)
        all_days = pd.date_range(start=str(time_to_market), end=dt.date.today(), freq="B")
        all_days = [x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print e
                    continue
                break
            if df.index.size > 3:
                dir_name = "/tmp/ticks/" + str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)
                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        # Write to HDFS
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)
        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
def saveStockFile(self, stock_name, data_frame, hdfs_path):
    print("saving stock " + stock_name)
    outFile = self.fileLocalOutput + stock_name + ".csv"
    data_frame.to_csv(outFile, index=None, header=True)
    print(outFile)
    from_path = outFile
    if hdfs_path != "":
        print(hdfs_path)
        # to_path = 'hdfs://localhost:9000/user/xavier/US_Stocks/' + stock_name + '.csv'
        to_path = hdfs_path + stock_name + '.csv'
        print(from_path + "==>" + to_path)
        hdfs.put(from_path, to_path)
        os.remove(outFile)
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except:
                pass
    except:
        pass
    print "hdfs tgt_dict=", tgt_dict
    # get source info
    src_fs = glob.glob(src_fname)
    print "src_fs=", src_fs
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except:
            src_ctime_int = None
        print "src_ctime_int=", src_ctime_int
        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put or rm/put
        try:
            if src_bfname not in tgt_dict:
                # insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: put ", src_bfname, "to", hdfs_tgt_dir
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: replace ", tgt_fname, "by", sf
            else:
                print tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int
        except:
            e = sys.exc_info()[0]
            print "Error: ", e
def copy_to_hdfs(local_path, relative_hdfs_path, overwrite=False, project=None):
    """
    Copies a path from the local filesystem to the HDFS project (recursively),
    using a local path relative to $CWD and an HDFS path relative to the
    project root.

    For example, if you execute:

    >>> copy_to_hdfs("data.tfrecords", "/Resources", project="demo")

    this will copy the file data.tfrecords to
    hdfs://Projects/demo/Resources/data.tfrecords

    Args:
        :local_path: absolute or relative path on the local filesystem to copy
        :relative_hdfs_path: path in HDFS, relative to the project root, to
            which the local path should be written
        :overwrite: whether to overwrite the path if it already exists in HDFS
        :project: name of the project, defaults to the current HDFS user's project
    """
    if project is None:
        project = project_name()
    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        if local_path.startswith(os.getcwd()):
            # Absolute path
            full_local = local_path
        else:
            # Relative path
            full_local = os.getcwd() + '/' + local_path
    hdfs_path = _expand_path(relative_hdfs_path, project, exists=False)
    if overwrite:
        hdfs_path = hdfs_path + "/" + os.path.basename(full_local)
        if exists(hdfs_path):
            # delete the HDFS path since the overwrite flag was set to true
            delete(hdfs_path, recursive=True)
    print("Started copying local path {} to hdfs path {}\n".format(local_path, hdfs_path))
    # copy directories from the local path to the HDFS project path
    hdfs.put(full_local, hdfs_path)
    print("Finished copying\n")
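# Hedged follow-up to the docstring example above: re-running the copy with
# overwrite=True first deletes any existing
# Projects/demo/Resources/data.tfrecords in HDFS, then uploads the local
# file again.
copy_to_hdfs("data.tfrecords", "/Resources", overwrite=True, project="demo")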
def word_count():
    wordDict = defaultdict(int)
    # read the whole text and normalize it
    text = open("/home/marcus/tasks/Shakespeare.txt", "r").read()
    text = text.lower()
    for ch in '"!@#$%^&*()-_=+,<.>/?;:[{]}~`\\|':
        text = text.replace(ch, " ")
    for word in text.split():
        wordDict[word] += 1
    # print(wordDict["the"])
    with open('/home/marcus/Mindbender_BD/task2/python_output.txt', 'w') as file:
        file.write(json.dumps(wordDict))
    from_path = "/home/marcus/Mindbender_BD/task2/python_output.txt"
    to_path = 'hdfs://localhost:9000/task2/outfile.txt'
    hdfs.put(from_path, to_path)
def setup(self):
    """
    * Creates an HDFS directory with the name of this test
      (self.make_hdfs_test_path())
    * uploads the local 'input' directory into the HDFS directory
    """
    self.logger.debug("Test setup")
    # hadut.run_hadoop_cmd_e("dfsadmin", args_list=["-safemode", "wait"])
    # self.logger.debug("hdfs out of safe mode")
    if hdfs.path.exists(self.make_hdfs_test_path()):
        error_msg = "hdfs test path '%s' already exists. Please remove it" % self.make_hdfs_test_path()
        self.logger.fatal(error_msg)
        raise RuntimeError(error_msg)
    hdfs.mkdir(self.make_hdfs_test_path())
    local_input = self.make_local_input_path()
    hdfs_input = self.make_hdfs_input_path()
    hdfs.put(local_input, hdfs_input)
    self.logger.info("Copied local input %s to %s", local_input, hdfs_input)
    self.logger.debug("Setup complete")
def copy_to_hdfs(local_path, relative_hdfs_path, overwrite=False, project=None):
    """
    Copies a path from the local filesystem to the HDFS project (recursively),
    using a local path relative to $CWD and an HDFS path relative to the
    project root.

    For example, if you execute:

    >>> copy_to_hdfs("data.tfrecords", "/Resources/", project="demo")

    this will copy the file data.tfrecords to
    hdfs://Projects/demo/Resources/data.tfrecords

    Args:
        :local_path: the path on the local filesystem to copy
        :relative_hdfs_path: path in HDFS, relative to the project root, to
            which the local path should be written
        :overwrite: whether to overwrite the path if it already exists in HDFS
        :project: name of the project, defaults to the current HDFS user's project
    """
    if project is None:
        project = project_name()
    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        full_local = os.getcwd() + '/' + local_path
    hdfs_path = _expand_path(relative_hdfs_path, project, exists=False)
    if overwrite:
        hdfs_handle = get()
        split = local_path.split('/')
        filename = split[-1]
        if filename == '':
            # local_path ended with a trailing slash; use the last real component
            filename = split[-2]
        full_project_path = hdfs_path + '/' + filename
        # if the project path already exists, delete it (overwrite flag was set to true)
        if hdfs_handle.exists(full_project_path):
            hdfs_handle.delete(full_project_path, recursive=True)
    # copy directories from the local path to the HDFS project path
    hdfs.put(full_local, hdfs_path)
def capture(outpath, max_count='3'):
    """
    fab cam.capture:/tmp/cam1,3
    """
    max_count = int(max_count)
    import os
    import cv2
    import copy
    import pydoop.hdfs as hdfs

    cv2.namedWindow('Window1')
    vc = cv2.VideoCapture()
    vc.open(0)
    skip = 50
    max_count *= skip
    basename = os.path.basename(outpath)
    count = 1
    hdfs.mkdir('hdfs://gnn-f02-01' + outpath)
    while True:
        retval, image = vc.read()
        try:
            if count % skip == 0:
                tmpImage = copy.copy(image)
                filename = '%05d.jpg' % (count / skip)
                hdfspath = 'hdfs://gnn-f02-01%(outpath)s/%(filename)s' % locals()
                cv2.putText(tmpImage, filename, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, 2)
                cv2.imshow('Windows1', tmpImage)
                cv2.waitKey(1)
                cv2.imwrite(basename + '_' + filename, image)
                hdfs.put(basename + '_' + filename, hdfspath)
                print basename + '_' + filename, hdfspath
            else:
                cv2.imshow('Windows1', image)
                cv2.waitKey(1)
        except KeyboardInterrupt:
            break
        count += 1
        if 0 < max_count < count:
            break
    vc.release()
    cv2.destroyWindow('Window1')
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually the
    superuser, things may be different if our working directory is on a
    shared POSIX filesystem.  Therefore, we make the directory and the
    script accessible by all.
    """
    pipes_code = self.__generate_pipes_code()
    hdfs.mkdir(self.remote_wd)
    hdfs.chmod(self.remote_wd, "a+rx")
    hdfs.dump(pipes_code, self.remote_exe)
    hdfs.chmod(self.remote_exe, "a+rx")
    hdfs.put(self.args.module, self.remote_module)
    hdfs.chmod(self.remote_module, "a+r")
    self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
    self.logger.debug("Created remote paths:")
    self.logger.debug(self.remote_wd)
    self.logger.debug(self.remote_exe)
    self.logger.debug(self.remote_module)
def run_task(factory, port=None, istream=None, ostream=None,
             private_encoding=True, context_class=TaskContext,
             cmd_file=None, fast_combiner=False, auto_serialize=True):
    """
    Run the assigned task in the framework.

    :rtype: bool
    :return: :obj:`True` if the task succeeded.
    """
    connections = resolve_connections(
        port, istream=istream, ostream=ostream, cmd_file=cmd_file,
        auto_serialize=auto_serialize
    )
    context = context_class(connections.up_link,
                            private_encoding=private_encoding,
                            fast_combiner=fast_combiner)
    stream_runner = StreamRunner(factory, context, connections.cmd_stream)
    pstats_dir = os.getenv(PSTATS_DIR)
    if pstats_dir:
        pstats_fmt = os.getenv(PSTATS_FMT, DEFAULT_PSTATS_FMT)
        hdfs.mkdir(pstats_dir)
        fd, pstats_fn = tempfile.mkstemp(suffix=".pstats")
        os.close(fd)
        cProfile.runctx(
            "stream_runner.run()",
            {"stream_runner": stream_runner}, globals(),
            filename=pstats_fn
        )
        name = pstats_fmt % (
            "r" if context.is_reducer() else "m",
            context.get_task_partition(),
            os.path.basename(pstats_fn)
        )
        hdfs.put(pstats_fn, hdfs.path.join(pstats_dir, name))
    else:
        stream_runner.run()
    context.close()
    connections.close()
    return True
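# Hedged sketch (an assumption, not from the original sources): enabling the
# profiling branch of run_task above. PSTATS_DIR and PSTATS_FMT are module
# constants holding environment-variable names; "PYDOOP_PSTATS_DIR" and
# "PYDOOP_PSTATS_FMT" are assumed values for those constants.
import os

os.environ["PYDOOP_PSTATS_DIR"] = "pydoop_pstats"  # HDFS directory for .pstats dumps
os.environ["PYDOOP_PSTATS_FMT"] = "%s_%05d_%s"     # (m/r flag, partition, temp file name)
# With these set, run_task(factory) wraps stream_runner.run() in cProfile and
# hdfs.put()s the resulting .pstats file into the directory above.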
def upsert_a_file(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get source file info
    try:
        src_ctime_int = int(os.path.getctime(src_fname))
    except:
        src_ctime_int = None
    print "src_ctime_int=", src_ctime_int
    # get target file info
    try:
        tgt_stat = hdfs.stat(tgt_fname)
        tgt_mtime = tgt_stat.st_mtime
    except:
        tgt_mtime = None
    print "tgt_mtime=", tgt_mtime
    # put or rm/put
    try:
        if tgt_mtime is None:
            # insert new one
            if debug == 'N':
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print "DEBUG: put ", src_fname, "to", hdfs_tgt_dir
        elif src_ctime_int > tgt_mtime:
            if debug == 'N':
                hdfs.rmr(tgt_fname)
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print "DEBUG: replace ", tgt_fname, "by", src_fname
        else:
            print tgt_fname, "has a newer mdate:", tgt_mtime, "than", src_fname, ":", src_ctime_int
    except:
        # use the exception instance, not the class, to read errno/strerror
        e = sys.exc_info()[1]
        print "Error({0}): {1}".format(e.errno, e.strerror)
def main(file_name, **kwargs):
    # check each file against the registry
    # determine if it's spam, a duplicate, or should be ingested
    # copy the file to the appropriate place, check for equal file size,
    # and delete one of the files appropriately
    metadata = {}
    write_path = instance_guid = stage = header = ""
    file_type = "raw"
    # json.dumps(metadata)
    # Log the PID to help in debugging
    logger.info('Pid : ' + str(os.getpid()))
    try:
        # attempt to get the registry entry. If Alfred isn't working properly
        # we'll get a connection error
        if file_name.startswith('sbx_'):
            # asking the registry for a sandbox file and stripping "sbx_" off
            # the file name; keys don't have the prefix
            metadata = reg.get_metadata(file_name[4:], stage='sandbox')
        else:
            metadata = reg.get_metadata(file_name)
    except requests.ConnectionError as e:
        # log response error
        logger.error('Failed to connect to Alfred : ' + str(e))
        exit(e)
    if 'stage' in metadata:
        stage = metadata['stage']
    # get the count of the number of rows in the source file
    row_count = file_len(landing_zone + '/' + file_name)
    logger.info('row count = ' + str(row_count))
    if 'file' in metadata and metadata['file'] != {}:
        # a registry entry exists for the file, process it
        logger.info("Moving " + file_name + " to hdfs://" + reg.file_path(metadata, **kwargs))
        # set the write path based on the metadata
        write_path = reg.file_path(metadata, **kwargs)
        logger.info("Moving " + file_name + " to " + write_path)
        if stage == 'sandbox' and hdfs.path.exists(write_path + '/' + file_name):
            # in the case of sandbox files previous data is always overwritten
            logging.info("Sandbox file already exists, overwriting")
            # Delete from HDFS is not strictly needed if the table was created as external
            hdfs.rmr(write_path + '/' + file_name)
            # set up a hive connection
            hive = validator.Hive()
            # use the hive connection to delete the sandbox table
            hive.drop_table(metadata, stage=stage)
            # close the hive connection
            hive = None
        # check to make sure the file doesn't already exist
        if not hdfs.path.exists(write_path + '/' + file_name):
            # if it doesn't, write it to the appropriate location
            hdfs.put(landing_zone + '/' + file_name, write_path + '/' + file_name)
            # create a second copy for the work table unless it's a sandbox file
            if stage != 'sandbox':
                # create work copy write path
                work_write_path = reg.file_path(metadata, type='work', **kwargs)
                # delete the work file if there is already one present
                if hdfs.path.exists(work_write_path):
                    logger.info("Deleting existing work files at " + work_write_path)
                    hdfs.rmr(work_write_path)
                # write the file to the work file location
                hdfs.put(landing_zone + '/' + file_name, work_write_path + '/' + file_name)
            else:
                # if this is a sandbox file, we might need the header row;
                # it's far easier to get this now than from hdfs
                header = get_header(file_name)
            # register that the raw file was written
            instance_guid = reg.register_raw(metadata, file_name, file_type, row_count)
        else:
            # if the file does exist, it's treated as a duplicate
            logger.info("Duplicate file")
            file_type = "duplicate"
            # set up duplicate write path
            write_path = reg.dup_file_path(metadata)  # + '/' + metadata['file']['key']
            # check to see if it's a duplicate of an existing duplicate
            if hdfs.path.exists(write_path + '/' + file_name):
                # delete the existing duplicate and write the new one
                logging.info("duplicate file already exists, overwriting")
                hdfs.rmr(write_path + '/' + file_name)
                hdfs.put(landing_zone + '/' + file_name, write_path + '/' + file_name)
                logger.info("writing duplicate file " + write_path + '/' + file_name)
                reg.register_raw(metadata, file_name, file_type, row_count)
            else:
                # first-time duplicates just get written
                hdfs.put(landing_zone + '/' + file_name, write_path + '/' + file_name)
                logger.info("writing duplicate file " + write_path + '/' + file_name)
                reg.register_raw(metadata, file_name, file_type, row_count)
    else:
        # no registry entry for this file, move it to spam
        file_type = "spam"
        # set up write path for spam
        write_path = reg.spam_file_path(metadata)
        logger.info("Moving " + file_name + " to " + write_path + '/' + file_name)
        # check to see if it's a duplicate of an existing spam file
        if hdfs.path.exists(write_path + '/' + file_name):
            # delete the existing spam and write the new one
            logging.info("spam file already exists, overwriting")
            hdfs.rmr(write_path + '/' + file_name)
            hdfs.put(landing_zone + '/' + file_name, write_path + '/' + file_name)
            logger.info("writing spam file " + write_path + '/' + file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)
        else:
            # first-time spam gets written as normal
            hdfs.put(landing_zone + '/' + file_name, write_path + '/' + file_name)
            logger.info("writing spam file " + write_path + '/' + file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)
    # confirm that source and target file have the same size, regardless of
    # spam, duplicate or normal
    if hdfs.path.exists(write_path + '/' + file_name) and \
            hdfs.path.getsize(write_path + '/' + file_name) == os.stat(landing_zone + '/' + file_name).st_size:
        # if the file sizes match, delete the source file
        os.remove(landing_zone + '/' + file_name)
        logger.info("Landing zone file removed " + landing_zone + '/' + file_name)
    else:
        # if the file sizes do not match, delete the target file and rename the
        # source file so it doesn't get reprocessed repeatedly
        logger.error("Source and target file sizes didn't match, not deleting source.")
        hdfs.rmr(write_path + '/' + file_name)
        os.rename(landing_zone + '/' + file_name, landing_zone + '/' + file_name + '.err')
        raise ValueError("Source and target file sizes don't match")
    # copy_only is an option set up in case there's ever a reason not to
    # process beyond moving the file to HDFS
    if 'copy_only' not in kwargs or not kwargs['copy_only']:
        if file_type == "raw":
            # raw, meaning not spam or duplicate; no reason to validate those
            if stage != 'sandbox':
                # if it's not a sandbox file, proceed with full validation
                logger.info("Validate " + file_name)
                validator.main(file_name, instance_guid, metadata)
            elif stage == 'sandbox':
                # if it is a sandbox file, we need to mark it as such so the
                # validator only creates the table
                logger.info("Sandbox validate " + file_name)
                validator.main(file_name, instance_guid, metadata, header=header, stage=stage)
    # log that this PID is ending
    logger.info('Pid ending : ' + str(os.getpid()))
# Create a set to exclude punctuation
exclude = set(string.punctuation)

# For each string in the text file, remove unwanted characters
for i in txtFile:
    i = ''.join(ch for ch in i if ch not in exclude)
    newlist.append(i)

# Convert all words in the list to lower case
wordlist = [w.lower() for w in newlist]

dic = dict()
for word in wordlist:
    if word in dic:
        dic[word] += 1
    else:
        dic[word] = 1

result = sorted(dic.items(), key=lambda x: x[1])
with open("sample.txt", "w") as outfile:
    json.dump(result, outfile)

file_path = "/home/user/sample.txt"
finalFile = "hdfs://localhost:9000/Test_002"
hdfs.put(file_path, finalFile)
def put(self, source, destination):
    hdfs.put(source, destination)
import os
import glob

directoryPath = 'data/'
for file_name in glob.glob(directoryPath + '*.csv'):
    print(file_name)
    arr = file_name.split('/')
    fname = arr[1]
    b = hdfs.path.isdir("/data")
    if b:
        hdfs_client = hdfs.hdfs()
        data_list = hdfs_client.list_directory("/data")
        print(data_list)
        for item in data_list:
            print(item["name"])
            if fname in item["name"]:
                print("rm -->", item["name"])
                hdfs.rm(item["name"], recursive=True, user=None)
        print("---after rm ---")
        data_list = hdfs_client.list_directory("/data")
        print(data_list)
        hdfs.put(file_name, "/data")
        print("---after put ---")
        data_list = hdfs_client.list_directory("/data")
        print(data_list)
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    with Timer() as total_time:
        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)
        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT
        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))
        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())
        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(
            int(x.replace("MB", "")), int(y.replace("MB", ""))
        ))
        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info(" dataset folder created")
            hdfs.mkdir(DATASET_DIR)
        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)
            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)
        update_conf(args)
        results = dict()
        for data_input in dataset:
            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR, logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                # print res
                # runner.clean()
            results[data_input] = (t.secs, t.msecs)
    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " * script: {0}".format(piped_code_file)
    print " * mappers: {0}".format(CONF["mapred.map.tasks"])
    print " * reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " * dataset: [{0}]".format(",".join(dataset))
    print " * times (input -> secs):"
    for data_input in dataset:
        print " - {0} -> {1} secs.".format(data_input, results[data_input][0])
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
def out_to_dfs(file, dfs_path):
    print("Writing file to HDFS...")
    hdfs.put(file, dfs_path)
def main():
    # parse arguments
    parser = ArgumentParser(description=__description__)
    args = arg_parser(parser)
    if args.in_zipfname:
        in_zipfname = args.in_zipfname
    else:
        in_zipfname = 'data_test.zip'
    if args.outdir:
        outdir = args.outdir
    else:
        outdir = 'out'
    if args.outfname:
        outfname = args.outfname
    else:
        outfname = 'outfname'
    if args.dfs_folder:
        dfs_folder = args.dfs_folder
    else:
        dfs_folder = None
    if args.row_id:
        row_id_str = args.row_id
    else:
        row_id_str = "0"

    # log time
    t0 = time()
    print "outdir=", outdir

    # get input zip file name
    outfname = os.path.basename(in_zipfname)
    # get file name and its extension suffix
    (root, ext) = os.path.splitext(outfname)
    # set output file name
    outfname = root + ".gz"
    # get output file handle
    zout = create_zfile(outdir, outfname)

    # input file is a .zip file with folders inside
    zin = zipfile.ZipFile(in_zipfname, "r")
    count = 0
    folder_list = []
    # open each file in the .zip file, parse it and save to a .gz file
    # for filename in zin.namelist():
    for info in zin.infolist():
        # get filename from zin
        filename = info.filename
        # dd = info.date_time
        # print "f=", filename, ", dt=", datetime.datetime(*dd)
        meta_list = []
        # get folder name
        folder, name = filename.split('/')
        # print "--", folder, " --", filename
        # assume first-level folders are labels; collect labels
        if folder not in folder_list:
            folder_list.append(folder)
        # transform here ======================
        if len(name) > 0:  # exclude folder name
            content = zin.read(filename)
            if len(content) <= 0:
                print "Content not found for [" + filename + "]"
            else:
                # count files
                count = count + 1
                # label
                meta_list.append(folder)
                # md5; assume file name is md5
                bname = os.path.basename(filename)
                (namep, ext) = os.path.splitext(bname)
                meta_list.append(namep)
                # date of file
                meta_list.append(str(datetime.datetime(*info.date_time)))
                # print "meta_list=", meta_list
                # print "content=", len(content), " type=", type(content)
                # print "bname=", bname
                # for zip file; write to different files
                # zout.writestr(bname, format_content(meta_list, content))
                # write to .gz file
                zout.write(format_content(meta_list, content))
                # allow 100 samples per .gz file; create another file afterwards
                if count % 100 == 0:
                    zout.close()
                    outfname = root + "_" + str(count) + ".gz"
                    zout = create_zfile(outdir, outfname)
    zout.close()
    zin.close()
    # print "folder_list=", folder_list

    # upload to HDFS
    if dfs_folder:
        # clean up folder
        dfs_folder = os.path.join(HDFS_RETR_DIR, dfs_folder)
        print "dfs_folder=", dfs_folder
        try:
            hdfs.rmr(dfs_folder)
        except:
            e = sys.exc_info()[0]
            print "Warning: delete hdfs error: ", e
        try:
            hdfs.put(outdir, dfs_folder)
        except:
            e = sys.exc_info()[0]
            print "Error: Put files error.", e

    t1 = time()
    print 'running time: %f' % (t1 - t0)
    return 0
def send_file(file):
    print("Saving to HDFS")
    dest = 'hdfs://localhost:9000/Task-002/python_output.txt'
    hdfs.put(file, dest)
    print("Saved to HDFS")
for subdir, dirs, files in os.walk(localInputDirPath):
    dirs.sort()
    files.sort()
    for file in files:
        filePath = os.path.join(subdir, file)
        if (filePath.endswith(('.jpg', '.tiff', '.tif', '.png', '.JPG', '.TIFF', '.TIF', '.PNG'))
                and os.path.getsize(filePath) > 0):
            flattenedPath = subdir.replace("/", "_")
            if flattenedPath.startswith('_'):
                flattenedPath = flattenedPath[1:]
            hdfsFileName = flattenedPath + file
            hdfs_path = hdfsOutputDirPath + hdfsFileName
            try:
                hdfs.put(filePath, hdfs_path)
                imageCount += 1
                print '[' + str(imageCount) + '] file: ' + hdfsFileName + ' ===> ' + \
                    hdfs_path + ' Size = ' + str(os.path.getsize(filePath))
                # os.remove(filePath)
            except IOError:
                # os.remove(filePath)
                continue

print '======================================================================='
print '= SUCCESS: All files successfully moved from local folder to HDFS    ='
print '======================================================================='
# coding: UTF-8
import pydoop.hdfs as hdfs

from_path = '/tmp/cctv/abc.txt'
to_path = 'hdfs://localhost:22/tmp/outfile.txt'
hdfs.put(from_path, to_path)
if b:
    hdfs_client = hdfs.hdfs()
    data_list = hdfs_client.list_directory('/data')
    print(data_list)
    for item in data_list:
        print(item['name'])
        if '2020-12-28_generated_demo.csv' in item['name']:
            print('rm -->', item['name'])
            hdfs.rm(item['name'], recursive=True, user=None)
    print('---after rm ---')
    data_list = hdfs_client.list_directory('/data')
    print(data_list)

    print('---get test ---')
    lines = []
    with hdfs.open('hdfs://127.0.0.1:9000/data/source_demo.csv') as f:
        for line in f:
            # print(line, type(line))
            l = line.decode('utf-8')
            if '2020-11-15' in l:
                lines.append(l)
    print(lines)
    print('---end get----')

    hdfs.put('2021-02-09_generated_demo.csv', '/data')
    print('---after put ---')
    data_list = hdfs_client.list_directory('/data')
    print(data_list)
import pydoop.hdfs as hdfs
import pydoop
from datetime import datetime

print datetime.now().time()
pydoop.hdfs.hdfs(host='default', port=0, user=None, groups=None)
hdfs.mkdir('NEWS ARTICLES')
# hdfs.put('/home/hduser1/PycharmProjects/Crawler/NEWS.csv', 'NEWS ARTICLES/NEWS.csv')
var = hdfs.mkdir('ALL UBL DATA')
print var
hdfs.mkdir('ALL HBL DATA')
hdfs.mkdir('ALL OGDCL DATA')
hdfs.mkdir('ALL ENGRO DATA')
hdfs.mkdir('ALL PSO DATA')
hdfs.mkdir('MISC')
hdfs.put('/home/hduser1/PycharmProjects/Crawler/Unwanted Stuff', 'MISC/Unwanted')
hdfs.put('/home/hduser1/PycharmProjects/Crawler/kse 100', 'HISTORICAL/')
# delete is an instance method, so instantiate the hdfs handle first
hdfs.hdfs().delete("/user/hduser1/HISTORICAL", recursive=True)
import re
from pydoop import hdfs

counts = dict()
with open('/home/field/Desktop/Shakespeare.txt') as f:
    File = f.read().split()

for items in File:
    items = items.lower()
    items = ''.join(re.findall(r'[a-zA-Z0-9@\s]+', items))
    if items in counts:
        counts[items] += 1
    else:
        counts[items] = 1

counts = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]))
print(counts)

with open('listfile.txt', 'w') as f:
    for listitem in counts:
        f.write(" ".join(str(s) for s in listitem) + '\n')

hdfs_path = "hdfs://localhost:9000/SPtext/PythonResult.txt"
hdfs.put("listfile.txt", hdfs_path)
def convertGRIBs(aws_key, aws_secret_key, numprocesses, myremainder,
                 compressed_flag=True, output_to_S3_flag=False):
    tempdir = "/mnt3/ubuntu"  # for r3.8xlarge instances, assumes this is linked to one of the SSDs
    # tempdir = "/tmp"  # for other instances

    conn = S3Connection(aws_key, aws_secret_key)
    # source of the CFSR-O data, as a set of grb2 files
    bucket = conn.get_bucket('agittens')
    keys = bucket.list(prefix='CSFR-O/grib2/ocnh06.gdas')
    # should manually enforce a sorting on these so you know the explicit map
    # between the record number and a particular sample observation

    # make these vectors global because they're huge, so we don't want to reallocate them
    dimsperlevel = 360 * 720
    dims = dimsperlevel * 41
    vals = np.zeros((dims,))
    mask = np.zeros((dims,)) < 0

    tempgribfname = tempdir + '/temp{0}'.format(myremainder)

    # returns the set of gribs from an s3 key
    def get_grib_from_key(inkey):
        with open(tempgribfname, 'w') as fin:
            inkey.get_file(fin)
        return pygrib.open(tempgribfname.format(myremainder))

    # indices of the gribs within the grib file that correspond to SST and
    # sub-surface sea temperatures
    gribindices = range(1, 41)
    gribindices.append(207)
    gribindices = list(reversed(gribindices))

    # for a given set of gribs, extracts the desired temperature observations,
    # converts them to a vector of observations and drops missing observations
    def converttovec(grbs):
        for index in range(41):
            maskedobs = grbs[gribindices[index]].data()[0]
            vals[index * dimsperlevel:(index + 1) * dimsperlevel] = \
                maskedobs.data.reshape((dimsperlevel,))
            mask[index * dimsperlevel:(index + 1) * dimsperlevel] = \
                maskedobs.mask.reshape((dimsperlevel,))
        return vals[mask == False]

    # prints a given status message with a timestamp
    def report(status):
        print datetime.now().time().isoformat() + ":\t" + status

    # convenience function so we can write to a compressed or uncompressed file transparently
    if compressed_flag:
        myopen = gzip.open
    else:
        myopen = open

    error_fh = open('grib_conversion_error_log_{0}_of_{1}'.format(myremainder, numprocesses), 'w')
    recordDateMapping = {}
    mappingfname = "CFSROcsv/recordDateMapping/part-" + format(myremainder, "05")
    if compressed_flag:
        mappingfname += ".gz"

    for (recordnum, inkey) in enumerate(keys):
        # only process records assigned to this worker
        if (recordnum % numprocesses) != myremainder:
            continue
        recordDateMapping[recordnum] = inkey.name.split('.')[2]

        # choose the right name for the vector of observations and the vector
        # of masks depending on whether or not they're compressed
        valsfname = "CFSROcsv/vals/part-" + format(recordnum, "05")
        if compressed_flag:
            valsfname += ".gz"

        # avoid processing this set of observations if it has already been converted
        if output_to_S3_flag:
            possible_key = bucket.get_key(valsfname)
            if possible_key is not None:
                report("{0} already converted to csv, skipping record {1}".format(inkey.name, recordnum))
                continue
        else:
            if hdfs.path.isfile(valsfname):
                report("{0} already converted to csv, skipping record {1}".format(inkey.name, recordnum))
                continue

        # convert the observations and write them out to HDFS/S3, compressed or uncompressed
        try:
            grbs = get_grib_from_key(inkey)
            report("Retrieved {0} from S3".format(inkey.name))
            observations = converttovec(grbs)
            report("Converted {0} to a numpy array of observations".format(inkey.name))
            tempvalsfname = tempdir + '/tempvals{0}'.format(myremainder)
            with myopen(tempvalsfname, 'w') as valsfout:
                for index in range(0, observations.shape[0]):
                    valsfout.write("{0},{1},{2}\n".format(recordnum, index, observations[index]))
            report("Wrote numpy array to a local file")
            if output_to_S3_flag:
                valsoutkey = Key(bucket)
                valsoutkey.key = valsfname
                valsoutkey.set_contents_from_filename(tempvalsfname)
                report("Wrote {0} to {1} on S3".format(inkey.name.split('.')[2], valsfname))
            else:
                hdfs.put(tempvalsfname, valsfname)
                report("Wrote {0} to {1} on HDFS".format(inkey.name.split('.')[2], valsfname))
        except:
            report("Skipping record {0}! An error occurred processing {1}".format(recordnum, inkey.name))
            error_fh.write("Skipped {1}, record {0}\n".format(inkey.name, recordnum))

    try:
        os.remove(tempgribfname)
        os.remove(tempvalsfname)
        # write the record mapping out to file so we know which rows correspond to which date
        temprecordfname = tempdir + '/temp{0}recordmapping'.format(myremainder)
        with myopen(temprecordfname, 'w') as fout:
            for recordnum, keyname in recordDateMapping.iteritems():
                fout.write("{0},{1}\n".format(recordnum, keyname))
        report("Wrote the observation date to row number mapping for process {0} to local file".format(myremainder))
        if output_to_S3_flag:
            mappingoutkey = Key(bucket)
            mappingoutkey.key = mappingfname
            mappingoutkey.set_contents_from_filename(temprecordfname)
            report("Wrote record mapping for {0} to {1} on S3".format(myremainder, mappingfname))
        else:
            hdfs.put(temprecordfname, mappingfname)
            report("Wrote record mapping for {0} to {1} on HDFS".format(myremainder, mappingfname))
        os.remove(temprecordfname)
    except:
        report("Skipping writing the record mapping for {0}! An error occurred writing it out.".format(myremainder))
        error_fh.write("Skipping writing the record mapping for {0}! An error occurred writing it out.\n".format(myremainder))
    error_fh.close()