def read(fname):
    "Internal API to read data from HDFS files"
    # we need data file schema, later we need some proper default
    uri = os.getenv('WMA_SCHEMA', '/user/valya/test/fwjr_proc.avsc')
    schemaData = hdfs.load(uri)
    schema = avro.schema.parse(schemaData)
    out = []
    data = hdfs.load(fname)
    bytes_reader = io.BytesIO(data)
    if fname.endswith('.gz'):
        # use gzip'ed reader and pass BytesIO to it as a file object
        gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
        decoder = avro.io.BinaryDecoder(gzip_reader)
    else:
        # use non-compressed reader
        decoder = avro.io.BinaryDecoder(bytes_reader)
    reader = avro.io.DatumReader(schema)
    while True:
        try:
            rec = reader.read(decoder)
            out.append(rec)
        except:
            break
    # close gzip stream if necessary
    if fname.endswith('.gz'):
        gzip_reader.close()
    # close bytes stream
    bytes_reader.close()
    return out
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def read_avro(fname, schema):
    "Internal API to read data from HDFS files"
    out = []
    data = hdfs.load(fname)
    bytes_reader = io.BytesIO(data)
    if fname.endswith('.gz'):
        # use gzip'ed reader and pass BytesIO to it as a file object
        gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
        decoder = avro.io.BinaryDecoder(gzip_reader)
    else:
        # use non-compressed reader
        decoder = avro.io.BinaryDecoder(bytes_reader)
    reader = avro.io.DatumReader(schema)
    while True:
        try:
            rec = reader.read(decoder)
            out.append(rec)
        except:
            break
    # close gzip stream if necessary
    if fname.endswith('.gz'):
        gzip_reader.close()
    # close bytes stream
    bytes_reader.close()
    return out
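A minimal usage sketch for read_avro, assuming the avro and pydoop.hdfs imports used elsewhere on this page and an Avro schema file already stored on HDFS; the schema path reuses the fwjr_proc.avsc default from the read() example above, and the data path is purely illustrative:

import avro.schema
import pydoop.hdfs as hdfs

# Parse the schema document stored on HDFS (path reused from the read() example above).
schema = avro.schema.parse(hdfs.load('/user/valya/test/fwjr_proc.avsc'))

# read_avro() is the function defined above; the data path is hypothetical.
records = read_avro('/path/to/records.avro.gz', schema)
print(len(records))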
def read_job_conf(path: str) -> Dict[Any, Any]:
    """
    The configuration file is passed as a path on HopsFS.
    The file is a JSON document whose contents depend on the op type.
    """
    file_content = hdfs.load(path)
    return json.loads(file_content)
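A hedged usage sketch for read_job_conf; the configuration path below is hypothetical and stands in for whatever path the caller receives:

# Hypothetical HopsFS path to a JSON job configuration.
conf = read_job_conf("Resources/job_config.json")
print(list(conf.keys()))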
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):
        # requested to read concrete file
        out = []
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        fname = file_name(hdir, spec, self.compress)
        data = hdfs.load(fname)
        bytes_reader = io.BytesIO(data)
        if self.compress:
            # use gzip'ed reader and pass BytesIO to it as a file object
            gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
            decoder = avro.io.BinaryDecoder(gzip_reader)
        else:
            # use non-compressed reader
            decoder = avro.io.BinaryDecoder(bytes_reader)
        reader = avro.io.DatumReader(self.schema)
        while True:
            try:
                rec = reader.read(decoder)
                out.append(rec)
            except:
                break
        # close gzip stream if necessary
        if self.compress:
            gzip_reader.close()
        # close bytes stream
        bytes_reader.close()
        return out
    return self.empty_data
def read_schema(self, context):
    sfile = context.get_job_conf().get('avro.schema', None)
    try:
        schemaData = hdfs.load(sfile)
    except ValueError:
        # if not sfile:
        # else:
        sys.exit(1)
    self.schema = avro.schema.parse(schemaData)
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(src_bn, copy_on_wd_bn)
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)
    hdfs.cp(src, copy_on_wd)
    for t in src_t.walk():
        copy_name = t.name.replace(src_bn, "%s/%s" % (copy_on_wd_bn, src_bn))
        self.assertTrue(hdfs.path.exists(copy_name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(copy_name), self.data)
def load(hdfs_path):
    """
    Read the content of hdfs_path and return it.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).

    Returns:
        the read contents of hdfs_path
    """
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.load(hdfs_path)
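A minimal sketch of calling this wrapper; the relative file name is illustrative and is resolved against the project's HDFS path by _expand_path:

# Hypothetical relative path inside the project's HDFS directory.
content = load("Resources/data.csv")
print(len(content))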
def __init__(self, uri, compress=True):
    "ctor with hdfs uri: hdfsio:/path/schema.avsc"
    Storage.__init__(self, uri)
    schema = self.uri
    if not hdfs.ls(schema):
        raise Exception("No avro schema file found in provided uri: %s" % uri)
    self.hdir = self.uri.rsplit('/', 1)[0]
    if not hdfs.path.isdir(self.hdir):
        raise Exception('HDFS path %s does not exist' % self.hdir)
    schema_doc = hdfs.load(schema)
    self.schema = avro.schema.parse(schema_doc)
    self.compress = compress
def __cp_recursive(self, wd):
    src_t = self.__make_tree(wd)
    src = src_t.name
    copy_on_wd = "%s_copy" % src
    src_bn, copy_on_wd_bn = [
        hdfs.path.basename(d) for d in (src, copy_on_wd)
    ]
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
    # check semantics when target dir already exists
    hdfs.rmr(copy_on_wd)
    hdfs.mkdir(copy_on_wd)
    hdfs.cp(src, copy_on_wd, mode="wb")
    exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
    for t, exp_t in czip(src_t.walk(), exp_t.walk()):
        self.assertTrue(hdfs.path.exists(exp_t.name))
        if t.kind == 0:
            self.assertEqual(hdfs.load(exp_t.name), self.data)
def wordcount():
    f = request.files['file']
    f.save(
        os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_wordcount/text')
    os.system(
        'pydoop script -c combiner wordcount.py /user/input_wordcount /user/output_wordcount'
    )
    list_files = hdfs.hdfs().list_directory('/user/output_wordcount')
    return json.dumps([
        hdfs.load(file['name'], mode='rt') for file in list_files
        if 'SUCCESS' not in file['name']
    ])
def lowercase():
    f = request.files['file']
    f.save(
        os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_lowercase/text')
    os.system(
        "pydoop script --num-reducers 0 -t '' lowercase.py /user/input_lowercase /user/output_lowercase"
    )
    list_files = hdfs.hdfs().list_directory('/user/output_lowercase')
    return json.dumps([
        hdfs.load(file['name'], mode='rt') for file in list_files
        if 'SUCCESS' not in file['name']
    ])
def __init__(self, file_prefix, loadexist=False, readonly=False):
    CustomStorage.__init__(self)
    if not loadexist:
        if hdfs.path.exists('{0}_0'.format(file_prefix)):
            file_prefix += '_0'
        while hdfs.path.exists('{0}_0'.format(file_prefix)):
            insert_index = file_prefix.rfind('_')
            file_prefix = '{0}_{1}'.format(file_prefix[:insert_index],
                                           int(file_prefix[insert_index + 1:]) + 1)
    self.file_prefix = file_prefix
    self.read_only = readonly
    self.clear()
    logger.info('init hdfs storage from hdfs file_prefix {0}'.format(self.file_prefix))
    try:
        total_start = timeit.default_timer()
        prefix_split = hdfs.path.splitpath(self.file_prefix)
        folder_path = prefix_split[0]
        real_prefix = prefix_split[1] + '_'
        if not hdfs.path.exists(folder_path):
            hdfs.mkdir(folder_path)
        files_info = hdfs.lsl(folder_path)
        # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
        logger.debug('files_info:{0}'.format(files_info))
        sizecount = 0
        for file_info in files_info:
            start_time = timeit.default_timer()
            file_name = hdfs.path.splitpath(file_info['path'])[1]
            if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                logger.debug('file info: {0}'.format(file_info))
                page_id = file_name[len(real_prefix):]
                if not page_id.isdigit():
                    continue
                logger.debug('file {0} page id :{1}#'.format(file_info['path'], page_id))
                # if page_id.isdigit():
                logger.info('load {0}# page file {1}'.format(page_id, file_info['path']))
                content = hdfs.load(file_info['path'], mode='r')
                # logger.debug('{0}# page content:{1}'.format(page_id, content))
                self.pagedict[int(page_id)] = content
                logger.debug('{0}# page load complete'.format(page_id))
                end_time = timeit.default_timer()
                eval(generate_timer_log_str.format(
                    'load {0} {1} byte'.format(file_name, len(self.pagedict[int(page_id)])),
                    start_time, end_time))
                sizecount += len(self.pagedict[int(page_id)])
    except IOError as ie:
        logger.debug(traceback.format_exc())
def __init__(self, uri, wmauri, yarn=''):
    "ctor with LTS uri (hdfs:///path/schema.avsc) and WMArchive uri"
    self.uri = uri
    if not hdfs.ls(self.uri):
        raise Exception("No avro schema file found in provided uri: %s" % uri)
    self.hdir = self.uri.rsplit('/', 1)[0]
    if not hdfs.path.isdir(self.hdir):
        raise Exception('HDFS path %s does not exist' % self.hdir)
    schema_doc = hdfs.load(self.uri)
    self.schema = avro.schema.parse(schema_doc)
    self.taskmgr = TaskManager()
    self.wmauri = wmauri  # WMArchive URL which will be used by submit
    if not self.wmauri.endswith('/wmarchive/data'):
        self.wmauri = '%s/wmarchive/data' % self.wmauri
    self.yarn = yarn
import sys
import pydoop.hdfs as hdfs

# create package
# date = 'data/'+str(sys.argv[1])[2:]+'/'
st = '['
for x in hdfs.ls("data/18-02-21/"):
    # for x in hdfs.ls("date"):
    st = st + hdfs.load(x)
st = st.replace("\n", ",")
st = st[:-1]
st = st + ']'
# string4 = '{"input":"'+str(sys.argv[1])+'"}'
hdfs.dump(st, "test/hello.txt")
hdfs.get("test/hello.txt", "/tmp/tmp.txt")
import pydoop.hdfs as hdfs

hdfs.mkdir("test")
hdfs.dump("hello hadoop", "test/hello.txt")
text = hdfs.load("test/hello.txt")
print(text)
def get_data_from_hdfs(self, file_location_hdfs):
    if (self.file_exist(file_location_hdfs)) == False:
        return -1
    data = hdfs.load(file_location_hdfs)
    return data
def load(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path, mode="wb")
        rdata = hdfs.load(test_path)
        self.assertEqual(rdata, self.data)
import pydoop
import pydoop.hdfs as hdfs

if __name__ == "__main__":
    print hdfs.ls("/user")
    print hdfs.ls(".")
    files = hdfs.ls(".")
    text = hdfs.load(files[0])
    print text[0:20]
    print hdfs.path.isdir("/user")
    basename = hdfs.path.basename(files[0])
    #hdfs.get(basename,"/tmp/"+basename)
    #with open("/tmp/"+basename) as f:
    #    print f.read()
    #hdfs.put("/tmp/"+basename, basename+".copy")
    #print hdfs.load(basename+".copy")
    print hdfs.ls(".")
def main(input_path, output_attribute_index, scikit_output_path, spark_output_path):
    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()

    for file_path in hdfs.ls(input_path):
        # Load the file content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)

        # Parse the string matrix into real-valued arrays
        # which are then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 + output_attribute_index].astype('float64')

        # Train the model through incremental (partial) fitting
        regressor.partial_fit(input_matrix, output_vector)

        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # passed as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required in order to save the model)
    session = SparkSession(context)

    # Load RDD data from the input path
    input_data = context.textFile(input_path)

    # Split each row into fields
    input_data = input_data.map(lambda x: x.split(","))

    # Skip header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")

    # Drop the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[3:-5])) +
                                [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the feature columns
    # required by the linear regression fit method)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)