def load_from_hdfs(data_package, file_name):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
    # Split the stream location into the WebHDFS address (everything up to and
    # including the port, which ends in "0", e.g. :50070) and the path after it.
    hdfs_str = data_package.stream_hdfs_file_name
    hdfs_addr = hdfs_str[:hdfs_str.rfind('0/') + 1]
    hdfs_path = hdfs_str[hdfs_str.rfind('0/') + 2:]
    if log_type in ['time', 'all']:
        st = time.time()
    client = InsecureClient(hdfs_addr, user=getpass.getuser())
    with client.read('%s/%s' % (hdfs_path, file_name)) as reader:
        # The payload is binary image data, so wrap it in BytesIO
        # (not StringIO) before handing it to PIL
        data = numpy.array(Image.open(BytesIO(reader.read())))
    print_purple("LOADED")
    return data
def crop():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'
    if request.args.get('to') is None:
        return 'No "to" directory given.'
    directory_from = request.args.get('from')
    directory_to = request.args.get('to')
    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    with hdfs_client.read('/' + directory_from + 'data.csv') as reader:
        data = pd.read_csv(reader)
    data = dd.from_pandas(data, npartitions=24)
    data.map_partitions(compute_crop, directory_from, directory_to,
                        meta='dask.dataframe.core.Series').compute()
    create_csv(directory_to=directory_to)
    return "Crop finished"
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # status(strict=False) returns None when the path does not exist
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive=recursive)
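A minimal usage sketch for HdfsWrapper, assuming CONST.HDFS_URL and CONST.HDFS_USER are configured; the paths and payload below are illustrative placeholders, not values from the original code:

wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.mkdir_hdfs('/tmp/demo')                                      # hypothetical path
wrapper.write_hdfs('/tmp/demo/hello.txt', b'hello', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo/hello.txt'))                      # b'hello'
print(wrapper.list_hdfs('/tmp/demo'))                                # ['hello.txt']
wrapper.delete_hdfs('/tmp/demo', recursive=True)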
def read_hdfs(filename, root_dir='data'):
    data_dir = os.path.join(root_dir, filename)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')
    with client_hdfs.read(data_dir, encoding='latin-1') as reader:
        df = pd.read_csv(reader, index_col=0)
    return df
def load_from_hdfs(data_package, hdfs_addr, hdfs_path):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
    if log_type in ['time', 'all']:
        st = time.time()
    dp = data_package
    ds = dp.data_range
    # Extent of the requested block along each axis present in the range
    ds_seq = [ds[elem][1] - ds[elem][0] for elem in ['z', 'y', 'x'] if elem in ds]
    while True:
        try:
            client = InsecureClient(hdfs_addr, user=getpass.getuser())
            file_python_dtype = Vivaldi_dtype_to_python_dtype(dp.file_dtype)
            file_bytes = get_bytes(file_python_dtype)
            #print("START TO CONNECT HDFS")
            bef = time.time()
            # Read only the z-slab this worker needs: the offset skips the
            # leading xy-planes, the length covers the requested block
            with client.read(hdfs_path,
                             offset=ds_seq[1] * ds_seq[2] * ds['z'][0] * file_bytes,
                             length=ds_seq[0] * ds_seq[1] * ds_seq[2] * file_bytes) as reader:
                buf = reader.read()
            aft = time.time()
            diff = aft - bef
            print_bold("DATA LOADING ENDS from %s -- time elapsed = %.03f (sec), "
                       "reading speed = %.03f MB/sec"
                       % (socket.gethostname(), diff, len(buf) / diff * (1024 ** -2)))
            # numpy.fromstring is deprecated for binary input; frombuffer is equivalent
            data = numpy.frombuffer(buf, dtype=file_python_dtype).reshape(ds_seq)
            break
        except Exception:
            print(bcolors.WARNING + "Connection Broken" + bcolors.ENDC)
    return data
def normalize():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'
    if request.args.get('to') is None:
        return 'No "to" directory given.'
    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    from_directory = request.args.get('from')
    to_directory = request.args.get('to')
    with hdfs_client.read('/' + from_directory + 'data.csv') as reader:
        data = pd.read_csv(reader)
    data = dd.from_pandas(data, npartitions=24)
    data.map_partitions(compute_norm, from_directory, to_directory,
                        meta='dask.dataframe.core.Series').compute()
    create_csv(directory_to=to_directory)
    return 'Normalization done.'
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
            print(len(buf))
            return buf
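A hypothetical usage sketch for HDFSService; note that upload() and download() prepend base_path, while mkdir() and list() take absolute paths, so the names below are placeholders relative to /users/root:

service = HDFSService()
service.mkdir('/users/root/demo')                          # absolute path
service.upload('/demo/hello.txt', data=b'hello')           # raw bytes
service.upload('/demo/copy.txt', local_path='local.txt')   # local file (assumed to exist)
buf = service.download('/demo/hello.txt')
print(service.list('/users/root/demo'))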
def GetSentenceVectorsFromHDFS(file_path, hdfs_url, user):
    hdfs_client = InsecureClient(hdfs_url, user=user)
    with hdfs_client.read(file_path) as reader:
        df = pd.read_csv(reader)
    # Convert the vectors stored as text back into numpy arrays
    df['vector_'] = df['vector'].map(lambda x: np.asarray(
        x.replace("]", '').replace("[", '').split(' ')).astype(float))
    return df
class Prediction_ML():
    def __init__(self, dir_algo, algo, path_img):
        logging.info('prediction_ML.init')
        self.directory_algo = dir_algo
        self.path_img = path_img
        self.algo = algo
        self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
        self.image = self.read_image(self.path_img, 240)

    def read_image(self, path_img, img_size=0):
        logging.info('prediction_ML.read_image')
        img = 0
        try:
            with self.hdfs_client.read(path_img) as reader:
                img = Image.open(reader)
                if img_size != 0:
                    img = img.resize((img_size, img_size))
                img = img.convert('L').convert('RGB')
                img = np.asarray(img).flatten()
        except IOError as err:
            logging.error("Error reading image or path")
            logging.error(err)
        except Exception as err:
            logging.error("Unknown error in read_image")
            logging.error(err)
        return img

    def run(self):
        try:
            # Fetch the serialized model from HDFS, load it, then remove the local copy
            self.hdfs_client.download(self.directory_algo + self.algo + ".model",
                                      self.algo + ".model")
            model = joblib.load(self.algo + ".model")
            os.remove(self.algo + ".model")
            label = model.predict([self.image])
            try:
                array_proba = model.predict_proba([self.image])[0]
                proba = array_proba[label[0]]
            except Exception:
                proba = -1
            return label[0], proba
        except IOError as err:
            logging.error('Error model ' + str(self.algo) + ' is not trained yet!')
            logging.error('Train this model first before using it for predictions')
            return -1, 1
def main():
    # Connecting to HDFS
    client = InsecureClient(hdfsServer, user='******')
    # Downloading the list of most popular words
    with client.read('/tmp/word_count_100k.csv', encoding='UTF-8') as csvfile:
        w = csv.DictReader(csvfile)
        word_count_aux = list(w)[0]
    # Selecting the 2000 most popular words
    word_count_dict = {key: int(value) for key, value in word_count_aux.items()}
    word_count = collections.Counter(word_count_dict)
    top_words = [word for (word, _) in word_count.most_common(2000)]
    # Downloading the trained Logistic Regression model
    with client.read('/tmp/twitterML.model') as modelfile:
        logmodel = pickle.load(modelfile)
    # Starting Spark context and streaming
    sc = SparkContext(appName="StreamingKafkaTweetProcessor")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/checkpoint")
    # Configuring Spark Streaming with a Kafka Consumer using a JSON deserializer
    kafkaStream = KafkaUtils.createStream(
        ssc, zookeeperServer, 'spark-group', {'twitter': 1},
        valueDecoder=lambda m: json.loads(m.decode('UTF-8')))
    # Extracting the data field
    tweets = kafkaStream.map(lambda v: v[1])
    # Analysing the sentiment of each Tweet
    sentiment_tweets = tweets.map(
        lambda tweet: tweet_sentiment(tweet, logmodel, top_words))
    # Printing 10 Tweets with the sentiment each second
    sentiment_tweets.pprint(10)
    # Sending blocks of Tweets to the function responsible for writing them to HBase
    sentiment_tweets.foreachRDD(lambda rdd: rdd.foreachPartition(sendToHbase))
    ssc.start()
    ssc.awaitTermination()
def read_by_small():
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    files_list = client.list(HDFS_DIR)
    images = []
    for fn in files_list:
        with client.read(hdfs_path=os.path.join(HDFS_DIR, fn)) as reader:
            img = reader.read()
        images.append(img)
        print(len(img))
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        List a directory
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        Write a file, e.g.
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        Read a file's data, e.g.
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
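The check_dir_path decorator used above is not shown in the snippet. A minimal reconstruction of what it might look like, assuming its job is to supply a default directory when none is given (the '/' fallback is a guess, and it would need to be defined before the class):

# Hypothetical sketch of check_dir_path, not the original implementation
def check_dir_path(method):
    def wrapper(self, *args, dir_path=None, **kwargs):
        if dir_path is None:
            dir_path = '/'  # assumed default
        return method(self, *args, dir_path=dir_path, **kwargs)
    return wrapper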
def download(keyword):
    client = InsecureClient("http://ip_address", user="******")
    root_dir = "/username/dps"
    for folder in client.list(root_dir):
        if keyword not in folder:
            continue
        os.makedirs(os.path.join("data", folder), exist_ok=True)
        for file in client.list(root_dir + "/" + folder):
            target_path = os.path.join("data", folder, file)
            logging.info("Downloading for {}".format(target_path))
            if os.path.exists(target_path):
                logging.warning("{} already exists!".format(target_path))
                continue
            with open(target_path, "wb") as writer, \
                    client.read("{}/{}/{}".format(root_dir, folder, file)) as reader:
                writer.write(reader.read())
def get_stopwords_from_hdfs(self):
    stopwords_set = set()
    print("loading stopwords...")
    try:
        client = InsecureClient(self.HDFS_ADDR, user='******')
        # delimiter="\n" makes the reader yield one line at a time
        with client.read(self.STOPWORDS_PATH, encoding="utf-8",
                         delimiter="\n") as reader:
            for word in reader:
                stopwords_set.add(word)
        print("done!!")
    except NameError as n:
        print("failed to fetch data (stopwords) from HDFS", n)
    return stopwords_set
def reducer():
    web_hdfs_interface = InsecureClient('http://localhost:9870', user='')
    with web_hdfs_interface.read('/tic_tac_toe/ml-project/x_mapped.csv') as reader:
        # NOTE: the counts are read from the mapper's local x_mapped.csv;
        # the HDFS copy is opened here but never consumed
        to_reduce = open('x_mapped.csv', newline='\n')
        currentSquare = None
        reduced = {}
        print('REDUCING...')
        for line in to_reduce:
            # split the tab-separated (square, result) pair
            square, result = line.strip().split('\t')
            # if the dict key does not exist, initialise it
            if not reduced.get(square):
                reduced[square] = {'W': 0, 'L': 0}
            # when the square changes, print the finished square's totals
            if currentSquare != square:
                if currentSquare is not None:
                    print(currentSquare, reduced.get(currentSquare), 'W/L Ratio',
                          float(reduced.get(currentSquare)['W']) /
                          float(reduced.get(currentSquare)['L']))
                currentSquare = square
            # tally the result for this square
            reduced[square][result] += 1
        # print the last square
        print(currentSquare, reduced.get(currentSquare), 'W/L Ratio',
              float(reduced.get(currentSquare)['W']) /
              float(reduced.get(currentSquare)['L']))
def GetMetaDataFromHDFS(meta_data_path, hdfs_url, user):
    vect = None
    names = set()

    def remove_last_comma(line):
        # Drop the trailing comma and newline from a CSV-style line
        return line[0:len(line) - 2]

    hdfs_client = InsecureClient(hdfs_url, user=user)
    # encoding='utf-8' yields text lines, so split(',') works below
    with hdfs_client.read(meta_data_path, encoding='utf-8') as reader:
        lines = reader.readlines()
    vect = np.asarray(remove_last_comma(lines[1]).split(',')).astype(float)
    for name in remove_last_comma(lines[3]).split(','):
        if name == '':
            continue
        names.add(int(name))
    return vect, names
def search_img(src, key_value):
    data = []
    hdfs = InsecureClient(current_app.config['WEBHDFS_ADDR'],
                          user=current_app.config['WEBHDFS_USER'])
    with hdfs.read(src) as reader:
        data = reader.read()
    # numpy.fromstring is deprecated for binary input; frombuffer is equivalent
    nparr = numpy.frombuffer(data, numpy.uint8)
    img1 = cv2.imdecode(nparr, cv2.IMREAD_COLOR)  # cv2.IMREAD_COLOR in OpenCV 3.1
    orb = cv2.ORB_create()
    # find the keypoints and descriptors with ORB
    kp1, desc = orb.detectAndCompute(img1, None)
    m = {}
    for key in key_value:
        m[key] = search_img_desc(orb, desc, key_value[key])
    return m
def get_vec_from_hdfs(self):
    word_vec_dict = dict()
    print("loading word vectors...")
    try:
        client = InsecureClient(self.HDFS_ADDR, user='******')
        # delimiter="\n" makes the reader yield one line per iteration
        with client.read(self.VEC_PATH, encoding="utf-8",
                         delimiter="\n") as reader:
            for line in reader:
                parts = line.split(' ')
                key = parts[0]
                value = np.array(parts[1:len(parts) - 1]).astype(float)
                word_vec_dict[key] = value
        print("done!!")
    except NameError as n:
        print("failed to fetch data (word vectors) from HDFS", n)
    return word_vec_dict
def mapper():
    print("MAPPER READS...")
    web_hdfs_interface = InsecureClient('http://localhost:9870', user='')
    with web_hdfs_interface.read(
            '/tic_tac_toe/ml-project/tic-tac-toe.data') as reader:
        data = pd.read_csv(reader)
    to_reduce = []
    for index, row in data.iterrows():
        # for every column except the class label
        for square in data.columns[:-1]:
            # instead of 1 or 0 we have W and L
            if row.loc[square] == 'x' and row.loc['Class'] == 'positive':
                print(square, 'W')
                to_reduce.append('%s\t%s' % (square, 'W'))
            elif row.loc[square] == 'x' and row.loc['Class'] == 'negative':
                print(square, 'L')
                to_reduce.append('%s\t%s' % (square, 'L'))
    to_reduce.sort()
    with open('x_mapped.csv', 'w') as csvfile:
        for line in to_reduce:
            csvfile.write(line + '\n')
def read_by_bulk(index_fp):
    # The index file stores byte offsets on line 1 and file names on line 2
    with open(index_fp) as f:
        file_index_list = f.readline().split(",")
        filename_list = f.readline().split(",")
    file_index_list[-1] = file_index_list[-1].replace("\n", "")
    file_index_list = [int(item) for item in file_index_list]
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    with client.read(hdfs_path='./bulk_img.tiff') as reader:
        bulk = reader.read()
    images = []
    for i in range(len(file_index_list) - 1):
        img = bulk[file_index_list[i]:file_index_list[i + 1]]
        images.append(img)  # was images.append(images), which nested the list into itself
    img = bulk[file_index_list[-1]:]
    images.append(img)
    print('Total {} images'.format(len(images)))
    return images
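For context, a hedged sketch of how the bulk blob and its index file might be produced. The two-line index format (byte offsets, then file names) is inferred from the reader above; write_bulk and image_paths are hypothetical names, and HDFS_URL/HDFS_USERNAME are the same globals the snippet assumes:

def write_bulk(index_fp, image_paths):
    # Hypothetical companion to read_by_bulk: concatenate the images into one
    # blob, recording each image's starting byte offset
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    offsets, names = [], []
    blob = b''
    for p in image_paths:
        with open(p, 'rb') as f:
            offsets.append(len(blob))
            blob += f.read()
            names.append(os.path.basename(p))
    with open(index_fp, 'w') as f:
        f.write(','.join(str(o) for o in offsets) + '\n')
        f.write(','.join(names) + '\n')
    # Upload the blob to the same path read_by_bulk reads from
    client.write('./bulk_img.tiff', data=blob, overwrite=True)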
class interHDFS:
    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        for k, v in kwargs.items():
            # setattr stores each keyword under its own name
            # (self.k = v would only ever set an attribute literally named "k")
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]: {e}")
            raise ConnectionError("connection failed!")

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        l = []
        if not dirname:
            print("dirname is null")
        else:
            # walk() yields (root, dirs, files) tuples
            for file in self.connect.walk(dirname, depth=depth):
                if file[-1]:
                    for f in file[-1]:
                        l.append(file[0] + '/' + f)
        return l

    def downloadToCsv(self, filename: str) -> None:
        '''only split on the '€€' sign, and generate the same filename in the
        current directory'''
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv',
                      'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
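A hedged usage sketch for interHDFS; the namenode URL and directory names are placeholders, and csvdir must exist as a module-level variable since downloadToCsv reads it:

csvdir = './'  # interHDFS.downloadToCsv expects this global
ih = interHDFS('http://namenode:9870', user='hdfs')   # hypothetical address
print(ih.listDir('/'))
for path in ih.getFiles('/data', depth=2):            # hypothetical directory
    print(path)
ih.downloadToCsv('/data/sample.txt')  # writes ./sample.csv, replacing '€€' with ','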
def run_test(mode):
    client = InsecureClient('http://juneau:46731', user='******')  # HDFS Web UI port!!
    with client.read("/pubg/aggregate/agg_match_stats_0.csv") as f:
        df = pd.read_csv(f, usecols=[1, 3, 4, 9, 12], nrows=50000).replace(
            to_replace={'tpp': 2, 'fpp': 1}, value=None)
    #df = pd.read_csv('agg_match_stats_0.csv', usecols=[1, 3, 4, 9, 12], nrows=50000).replace(to_replace={'tpp': 2, 'fpp': 1}, value=None)
    if mode == 1:
        X = df[df['match_mode'] == 1].drop(
            columns=['match_mode']).values.astype('double')
        T = df[df['match_mode'] == 1].iloc[:, 4:].values.astype('double').reshape(-1, 1)
    if mode == 2:
        X = df[df['match_mode'] == 2].drop(
            columns=['match_mode']).values.astype('double')
        T = df[df['match_mode'] == 2].iloc[:, 4:].values.astype('double').reshape(-1, 1)
    network = [5]
    relu = True
    model = nn.NN_distributed(X.shape[1], network, T.shape[1], relu)
    if mode == 1:
        model.load_state_dict(torch.load('Best network (FPP).pth'))
    if mode == 2:
        model.load_state_dict(torch.load('Best network (TPP).pth'))
    Y = model.use_pytorch(X)
    RMSE_model = np.sqrt(np.mean((Y - T) ** 2))
    print(f'Best Network Test RMSE: {RMSE_model}')
    for i in range(1000, 5000, 500):
        print(f'Sample Target {i}: {T[i][0]}, '
              f'Predicted Value: {model.use_pytorch(X[i])[0]}')
class HDFS(BaseRepository):
    def __init__(self, host: str, port, user: str):
        super().__init__()
        self.host = host
        self.port = port
        self.user = user
        self.producer = None  # fixed typo: was "self.prodcuer"

    def connect(self):
        self.conn = InsecureClient(f"http://{self.host}:{self.port}", user=self.user)
        if os.environ.get("KAFKA_BOOTSTRAP", None):
            self.producer = KafkaProducer(
                bootstrap_servers=os.environ.get("KAFKA_BOOTSTRAP", "localhost:1234"))
        else:
            self.producer = None

    def disconnect(self):
        self.save_snapshot()
        if self.producer:
            self.producer.close()

    def insert_rows(self, rows: list[tuple[datetime.datetime, str, str, str, str, str]]):
        self.add_buff(rows)
        self.flush()

    def _last_datetime(self, category, date):
        if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
            return config.min_date
        # Copy the remote ORC file into a temporary local file, then scan it
        # for the latest timestamp in the given category
        with tempfile.NamedTemporaryFile("wb") as tf:
            with self.conn.read(f"/krwordcloud/add-article/{date}",
                                chunk_size=8096) as hf:
                for chunk in hf:
                    tf.write(chunk)
            tf.flush()
            with open(tf.name, 'rb') as f:
                reader = pyorc.Reader(f)
                maximum = datetime.datetime \
                    .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
                for row in reader:
                    if row[0] > maximum and row[1] == category:
                        maximum = row[0]
        if maximum < config.min_date:
            return config.min_date
        elif maximum > datetime.datetime.now().replace(tzinfo=KST):
            return datetime.datetime.now().replace(tzinfo=KST)
        else:
            return maximum

    def make_entries(self):
        entries = dict()
        hdfs_entries = dict()
        lookup_hdfs = []
        self.load_snapshot()
        # Prefer the newest in-memory row per category; fall back to HDFS
        for category in config.categories:
            category_rows = list(filter(lambda row: row[1] == category, self.buff))
            if len(category_rows) > 0:
                last = max(category_rows, key=lambda row: row[0])
                entries[category] = last[0]
            else:
                lookup_hdfs.append(category)
        try:
            dates = self.conn.list("/krwordcloud/add-article/")
            if len(dates) > 0:
                for category in lookup_hdfs:
                    found = False
                    for last in reversed(dates):
                        try:
                            entries[category] = self._last_datetime(category, last)
                            found = True
                            break
                        except Exception as e:
                            print(e)
                            continue
                    if found is False:
                        entries[category] = config.min_date
            else:
                hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date)
        except HdfsError:
            entries[category] = config.min_date
        except Exception as e:
            print(e)
        return {k: v for k, v in sorted({**entries, **hdfs_entries}.items(),
                                        key=lambda item: item[1])}

    def save_snapshot(self):
        print('save_snapshot')
        with self.conn.write("/krwordcloud/snapshot.json", overwrite=True,
                             encoding="utf-8") as f:
            data = list(map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]),
                            self.buff))
            json.dump(data, f, ensure_ascii=False)

    def load_snapshot(self):
        print('load_snapshot')
        try:
            with self.conn.read("/krwordcloud/snapshot.json", encoding="utf-8") as f:
                self.buff = list(map(
                    lambda x: (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]),
                    json.load(f)))
        except Exception:
            self.buff = []

    def flush(self):
        # Flush every complete day except the most recent, which stays buffered
        dates = sorted(list(set(map(lambda row: row[0].date(), self.buff))))
        if len(dates) > 1:
            for d in dates[:-1]:
                data = list(filter(lambda row: row[0].date() == d, self.buff))
                if self.producer:
                    self._kafka_flush(d, data)
                else:
                    self._hdfs_flush(d, data)
            self.buff = list(filter(lambda row: row[0].date() == dates[-1], self.buff))
        self.save_snapshot()

    def _kafka_flush(self, date, data):
        self.producer.send(f"add-article-{date}", data)

    def _hdfs_flush(self, date, data):
        # Write the rows to a local ORC file, then stream that file into HDFS
        with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                             overwrite=True) as hf:
            with tempfile.NamedTemporaryFile(mode="wb+") as tf:
                with pyorc.Writer(
                    tf,
                    schema="struct<field0:timestamp,field1:string,"
                           "field2:string,field3:string>",
                ) as of:
                    of.writerows(data)
                tf.seek(0)
                for line in tf:
                    hf.write(line)
TOPIC = os.environ["TOPIC"] HDFS_NAMENODE = os.environ["HDFS_HOSTNAME"] client_hdfs = InsecureClient(HDFS_NAMENODE) while True: try: producer = KafkaProducer(bootstrap_servers=KAFKA_BROKER.split(",")) print("Producer: Connected to Kafka!") break except kafka.errors.NoBrokersAvailable as e: print(e) time.sleep(3) file_name = "/produce/file/streaming.csv" with client_hdfs.read(file_name, encoding='utf-8') as reader: df = pd.read_csv(reader, low_memory=False) df.info(verbose=False) df = df[df['PtID'].notna()] df['PtID'] = df['PtID'].astype(int) df['Year'] = df['Year'].astype(int) df['Month'] = df['Month'].astype(int) df['Day'] = df['Day'].astype(int) print(df.head(10)) df = df.sort_values(by=['Date', 'Time']) for i, row in df.iterrows(): print(row.to_json()) producer.send(TOPIC, key=bytes(str(row['RecID']), 'utf-8'), value=bytes(row.to_json(), 'utf-8')) if i % 132 == 0:
from hdfs import InsecureClient

# log in to the hdfs server
client = InsecureClient('http://master32:50070', user='******')

# print the contents of the hdfs root folder
print(client.list('/'))

path = '/test/aaa.txt'
# Check if the file exists, and remove it if so
if client.content(path, strict=False) is not None:
    client.delete(path)

print("START TO WRITE FILE")
# write a text file to hdfs
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")
print("DONE")

print("START TO READ FILE")
# read a text file from hdfs
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)
print("DONE")
missing_data = 0  # counts how many records in the json file have empty data

# another method to get data from hdfs; can be used for GPU processing
# a = datetime.now()
# date = str(a.year) + str(a.month).zfill(2) + str(a.day).zfill(2)
# (ret, out, err) = run_cmd(['hadoop', 'fs', '-get',
#     '/data/atl_sprint_2018/lexis_archive/lexis_%sT0000.json' % (date),
#     './SocialMediaSprint/'])
# if ret == 0:  # if return code is 0, file exists
#     with open('lexis_%sT0000.json' % (date), 'r') as file1:

# Today's file not found - if the number of days between the current date and
# the last day's file is > 0, then today's file is not found. Runs after 2 hours.
if delta.days == 0:  # delta days = 0 ==> the last file was created today. Yay, we have data!
    # read the file from hdfs
    with client.read('/data/atl_sprint_2018/lexis_archive/' + fjson,
                     encoding='utf-8', delimiter='\n') as file1:
        for line in file1:  # each line is a json object (dictionary)
            try:
                news_article = json.loads(line)
                # keep articles whose text has at least 100 words
                if news_article['Text'] != 'None' and len(news_article['Text'].split(' ')) >= 100:
                    # append individual news articles to the data list
                    all_articles.append(news_article)
                else:
                    missing_data += 1
            except Exception:
                continue
# If there is data, process it and write it into the sentiment and topic files.
from hdfs import InsecureClient
from joblib import load
from io import BytesIO

client_hdfs = InsecureClient('http://localhost:9870', user='******')
path = '/home/hadoop/hdfs/test/xyBernoulliNB()-1.json'
with client_hdfs.read(path) as reader:
    model = load(BytesIO(reader.read()))
print(model)
# Task 1
from hdfs import InsecureClient
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# StringIO lets a string be treated like an input/output file
from io import StringIO

client = InsecureClient("http://192.168.56.100:50070", user="******")
# check the connection
# print(client)

# read the data
with client.read("output/dept_delay_count/part-r-00000", encoding="utf-8") as reader:
    data = reader.read()
# print(data)

# data is not a file yet: data -> str -> stream
stream = StringIO(data)
# print(stream)

df = pd.read_csv(stream, sep="\t", header=None)
# print(df)

# split the year and the month
# print(df[0].str.split(","))
df['year'] = df[0].str.split(",").str[0]
df['month'] = df[0].str.split(",").str[1]
# print(df)
# Install the hdfs package via pip
import pandas as pd
from hdfs import InsecureClient
import os

# ===== Connect to HDFS =====
#client_hdfs = InsecureClient('hdfs_address:web_port')
client_hdfs = InsecureClient('http://hadoop01.org:50070')

# ===== Read File in HDFS =====
with client_hdfs.read('hdfs_path_file', encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)
    print(df)

# ==== Creating a simple Pandas DataFrame =====
liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv', encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv', encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
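The snippet breaks off at its last heading. A plausible completion using the library's content() call, which returns a summary dict (length, fileCount, spaceConsumed, and so on) for a path, here the file written above:

# content() returns the WebHDFS content summary for the given path
print(client_hdfs.content('/user/hdfs/wiki/helloworld.csv'))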
class MachineLearning():
    # reads images and stores them
    def __init__(self, input_folder, model_folder, img_size=240):
        self.input_folder = input_folder
        self.model_folder = model_folder
        self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
        # pass the parameter through instead of hard-coding 240
        self.imgs, self.labels = self.read_images(input_folder, img_size)
        self.default = "svm"

    # reads images from a directory and resizes them
    # returns the list of images and list of labels
    def read_images(self, directory, img_size=0):
        list_img = []
        labels = []
        logging.info('read_images')
        try:
            for name in self.hdfs_client.list('/' + directory + 'yes'):
                if name == "Thumbs.db":
                    continue
                with self.hdfs_client.read('/' + directory + 'yes/' + name) as reader:
                    img = Image.open(reader)
                    if img_size != 0:
                        img = img.resize((img_size, img_size))
                    img = img.convert('L').convert('RGB')
                    list_img.append(np.asarray(img).flatten())
                    labels.append(1)
            for name in self.hdfs_client.list('/' + directory + 'no'):
                if name == "Thumbs.db":
                    continue
                with self.hdfs_client.read('/' + directory + 'no/' + name) as reader:
                    img = Image.open(reader)
                    if img_size != 0:
                        img = img.resize((img_size, img_size))
                    img = img.convert('L').convert('RGB')
                    list_img.append(np.asarray(img).flatten())
                    labels.append(0)
        except Exception as err:
            logging.error("Error in read_images")
            logging.error(err)
            list_img = []
            labels = []
        logging.info("Finished reading images")
        return list_img, labels

    # returns the untrained model for a given algorithm
    def get_model(self, algorithm, params):
        if algorithm == "knn":
            return KNeighborsClassifier(**params, n_jobs=-1)
        elif algorithm == "svm":
            return SVC(**params, gamma='auto', random_state=0, probability=True)
        elif algorithm == "gbc":
            return GradientBoostingClassifier(**params)
        elif algorithm == "rfc":
            return RandomForestClassifier(**params, n_estimators=500)
        elif algorithm == "nn":
            return neural_network.MLPClassifier(**params)
        else:
            return self.get_model(self.default, params)

    # returns a set of the "best" parameters for a given algorithm
    def get_params(self, algorithm):
        if algorithm == "knn":
            return {'n_neighbors': 9}
        elif algorithm == "svm":
            return {'kernel': 'poly', 'C': 10 ** -4}
        elif algorithm == "gbc":
            return {'n_estimators': 10}
        elif algorithm == "rfc":
            return {'max_depth': 8, 'max_features': "auto", 'criterion': "gini"}
        elif algorithm == "nn":
            return {'hidden_layer_sizes': tuple([64 for _ in range(10)])}
        else:
            return self.get_params(self.default)

    # trains a model using the best parameters and returns the score
    def train(self, algorithm, imgs, labels):
        params = self.get_params(algorithm)
        model = self.get_model(algorithm, params)
        logging.info("Training %s with the following parameters:" % (algorithm))
        logging.info(params)
        dask_client = Client(DASK_IP_ADRESS)
        img_train, img_test, lbl_train, lbl_test = train_test_split(
            self.imgs, self.labels, test_size=0.2)
        # Scatter the data to the workers, then fit and score remotely
        futures_img_train = dask_client.scatter(img_train)
        futures_img_test = dask_client.scatter(img_test)
        futures_lbl_train = dask_client.scatter(lbl_train)
        futures_lbl_test = dask_client.scatter(lbl_test)
        future_model_fit = dask_client.submit(model.fit, futures_img_train,
                                              futures_lbl_train)
        model = future_model_fit.result()
        future_score_train = dask_client.submit(model.score, futures_img_train,
                                                futures_lbl_train)
        future_score_test = dask_client.submit(model.score, futures_img_test,
                                               futures_lbl_test)
        score_test = future_score_test.result()
        score_train = future_score_train.result()
        logging.info("Training complete, saving model %s to file" % (algorithm))
        # saving the model to file
        with self.hdfs_client.write('/' + str(self.model_folder) + str(algorithm)
                                    + ".model") as writer:
            joblib.dump(model, writer)
        logging.info("Score on training set: %.4f, score on test set: %.4f"
                     % (score_train, score_test))
        return score_train, score_test
print(message)

# API communication
def fetch(table, year, round):
    url = ERGAST_ENDPOINT.format(year, round, table)
    print(f'Fetching URL... {url}')
    response = urllib.request.urlopen(url).read()
    print(f'Received response of length {len(response)}')
    return json.loads(response)

print('Step 1')
try:
    print(f'Reading last fetched race from HDFS: {HDFS_LAST_FETCHED_FILE}')
    with client.read(HDFS_LAST_FETCHED_FILE) as reader:
        lines = reader.read().decode().split('\n')
        last_year = int(lines[0].rstrip())
        last_round = int(lines[1].rstrip())
    print(f'Last fetched race: {last_round}_{last_year}')
except Exception as e:
    log(f'Cannot read last fetched year & round: {e}')
    sys.exit()

print('Step 2')
def is_race_available(year, round):
    j = fetch('results', year, round)
    total = int(j['MRData']['total'])
    return total > 0
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path")  # creates intermediate directories automatically

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
# 1. Connect to HDFS -> read the result file
# 2. Convert the results into a DataFrame
# 3. Visualize with Matplotlib
from hdfs import InsecureClient
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO  # reads a string as if it were a file

client = InsecureClient('http://192.168.56.100:50070', user='******')
# print(client)

# read the data
with client.read('output/arr_delay_count/part-r-00000', encoding='utf-8') as reader:
    data = reader.read()
# print(data)

# data -> str -> stream
stream = StringIO(data)
df = pd.read_csv(stream, sep='\t', header=None)
# print(df)

# post-processing: split the year and the month
# print(df[0].str.split(','))
df['year'] = df[0].str.split(',').str[0]
df['month'] = df[0].str.split(',').str[1]

# convert the year and month columns to int
df['year'] = df['year'].astype('int')
df['month'] = df['month'].astype('int')