def pdf2hdfs(self, mode='append', hdfs_path=''):
    """
    Save the pandas dataframe to HDFS in parquet format.
    :param mode: write mode, 'append' or 'overwrite'
    :param hdfs_path: target HDFS path; if empty, the default path is used
    :return: nothing
    """
    from sparkmodule import PySparkManager

    if hdfs_path == '':
        path = self._hdfs_path
    else:
        path = hdfs_path

    try:
        Log.d('pdf2hdfs()', 'pdf -> hdfs ::\n', self._pdf.iloc[:30])
    except Exception as e:
        Log.e('pdf2hdfs()', 'pdf is empty! : ', e.__class__.__name__)
        return

    # make spark dataframe
    self._spdf = PySparkManager().sqlctxt.createDataFrame(self._pdf)

    # append new data
    self._spdf.write.mode(mode).parquet(path)
    Log.d('pdf2hdfs()', 'parquet write completed.')
def read_asos(fname):
    old_columns = [
        '지점', '시간', '기온(°C)', '누적강수량(mm)', '풍향(deg)', '풍속(m/s)',
        '현지기압(hPa)', '해면기압(hPa)', '습도(%)', '일사(MJ/m^2)', '일조(Sec)'
    ]

    spdf_asos = PySparkManager().sqlctxt.read \
        .option('header', 'true') \
        .option('encoding', 'euc-kr') \
        .option('mode', 'DROPMALFORMED') \
        .csv('file://%s' % fname).cache()

    new_columns = [
        'station_code', 'datetime', 'temperature', 'accum_precipitation',
        'wind_dir', 'wind_speed', 'local_air_pressure', 'sea_air_pressure',
        'humidity', 'solar_radiation', 'solar_rad_time'
    ]

    spdf_asos_r = renameCols(spdf_asos, old_columns, new_columns)

    # drop_columns = ['station_code', 'precipitation', 'precipitation_qc', 'cloud_type', 'min_cloud_height']
    # spdf_asos_r = dropCols(spdf_asos_r, drop_columns)

    return spdf_asos_r
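# Usage sketch for read_asos() (an assumption for illustration, not part of the original
# source): the local csv path and the HDFS output path below are hypothetical, and
# renameCols() is the project-local helper already referenced above.
spdf_asos = read_asos('/home/witlab/asos_sample.csv')
spdf_asos.printSchema()
spdf_asos.write.mode('overwrite').parquet('hdfs:///nl/kma/asos.parquet')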
class FinalParticulateMatter:
    def __init__(self):
        self.col_list_src = [
            'location', 'station_code', 'station_name', 'datetime',
            'so2', 'co', 'o3', 'no2', 'pm10', 'pm25', 'address'
        ]
        self.col_list = [
            'location', 'station_code', 'address', 'datetime',
            'so2', 'co', 'o3', 'no2', 'pm10', 'pm25'
        ]
        self.type_list = [
            StringType, IntegerType, StringType, StringType,
            FloatType, FloatType, FloatType, FloatType, FloatType, FloatType
        ]
        self.sc = PySparkManager().sc
        self.sqlctxt = PySparkManager().sqlctxt
        self.tag = 'FinalPM'

    def search(self, dirname):
        # return the full path of every file and directory under dirname as a list
        filelist = []
        filenames = os.listdir(dirname)
        for filename in filenames:
            full_filename = os.path.join(dirname, filename)
            filelist.append(full_filename)
        return filelist

    # Why this is needed: in the particulate-matter API, the hour part of a datetime value
    # runs from 1 to 24. Each value is the amount of PM accumulated over that hour, but this
    # differs from the datehour time format used elsewhere, so it has to be converted.
    def _datetime_corrector(self, datetime_int):
        datetime_s = str(datetime_int)
        dtdate = datetime.datetime.strptime(datetime_s[:-2], '%Y%m%d')
        stime = datetime_s[8:]
        if stime == '24':
            oneday = datetime.timedelta(days=1)
            dtdate += oneday
            stime = '00'
        sdate = dtdate.strftime('%Y-%m-%d')
        return str(sdate + ' ' + stime)

    def xlsx2spdf(self, infilepath: str):
        data = pd.read_excel(infilepath, encoding='utf-8')  # read as pandas dataframe

        Log.d(self.tag + '.xlsx2spdf()', 'before changing column\n', data.iloc[:2])  # for debug

        if '망' in data.columns:  # if the column '망' exists
            data = data.drop(['망'], axis=1)  # drop it
            Log.d(self.tag, 'dropped column "망"\n', data.iloc[:2])  # for debug

        data.columns = self.col_list_src  # change column names
        Log.d(self.tag, 'after changing column\n', data.iloc[:2])  # for debug

        # correct datetime
        data['datetime'] = data['datetime'].apply(self._datetime_corrector)

        df = self.sqlctxt.createDataFrame(data)
        return df

    def xlsxdir2parquet(self, dirpath: str, hdfs_outpath: str):
        from pyspark.sql.functions import udf
        from pyspark.sql.types import StringType

        udf_mergeCol = udf(lambda s, t: s + ' ' + t, StringType())

        infilelist = self.search(dirpath)

        # read the confirmed-data files in the directory one by one and merge them into `merged`
        Log.d(self.tag + '.xlsxdir2parquet()', 'target file name:', infilelist[0])

        # read xlsx and make spdf
        merged = self.xlsx2spdf(infilelist[0])

        # concatenate two columns
        merged = merged.withColumn('location', udf_mergeCol('location', 'station_name'))
        merged = merged.drop('station_name')

        Log.d(self.tag + '.xlsxdir2parquet()', 'target file converted to spdf')
        merged.show()

        for i in range(1, len(infilelist)):
            Log.d(self.tag + '.xlsxdir2parquet()', 'target file name:', infilelist[i])

            # read xlsx and make spdf
            spdf = self.xlsx2spdf(infilelist[i])

            # concatenate two columns
            spdf = spdf.withColumn('location', udf_mergeCol('location', 'station_name'))
            spdf = spdf.drop('station_name')

            # merge spdf
            merged = merged.union(spdf)

            Log.d(self.tag + '.xlsxdir2parquet()', 'target file converted to spdf')
            merged.show()

        merged.show()
        merged.write.mode('overwrite').parquet(hdfs_outpath)
        Log.d(self.tag + '.xlsxdir2parquet()', 'parquet write completed.')
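# Usage sketch for FinalParticulateMatter (an assumption for illustration: the xlsx
# directory and the HDFS output path are hypothetical). The first call just demonstrates
# the hour-24 rollover handled by _datetime_corrector.
fpm = FinalParticulateMatter()
print(fpm._datetime_corrector(2019010124))   # -> '2019-01-02 00'
fpm.xlsxdir2parquet('/home/witlab/pm_final_xlsx', 'hdfs:///nl/pm/pm_final.parquet')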
import pandas as pd

from pyspark.sql.types import StructField, StructType, StringType, DoubleType
from sparkmodule import PySparkManager

# Log and getLatLon are assumed to be imported from the project's own helper modules.
pdf = pd.read_csv('/home/witlab/uvmon_location.csv', encoding='utf-8')

lat_col = []
lon_col = []
for i in range(len(pdf)):
    address = pdf.iloc[i]['address']
    Log.d('__main__', 'address:', address)
    lat, lon = getLatLon(address)
    lat_col.append(float(lat))
    lon_col.append(float(lon))

pdf['lat'] = lat_col
pdf['lon'] = lon_col
Log.d('__main__', 'pdf:\n', pdf)

# create spark dataframe
# col : [location, station_code, address, lat, lon]
schema = StructType([
    StructField('location', StringType()),
    StructField('station_code', StringType()),
    StructField('address', StringType()),
    StructField('lat', DoubleType()),
    StructField('lon', DoubleType()),
])

spdf = PySparkManager().sqlctxt.createDataFrame(pdf, schema)
spdf.write.mode('overwrite').parquet('hdfs:///nl/kma/uv_location.parquet')
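# Optional read-back check (a sketch): re-read the parquet written above and show a few rows.
spdf_check = PySparkManager().sqlctxt.read.parquet('hdfs:///nl/kma/uv_location.parquet')
spdf_check.show(5)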
class AbsApi(metaclass=ABCMeta):
    """
    Important issues:
    1. _pdf and _spdf must hold only the most recent measurement's data;
       otherwise pdf2hdfs(), pdf2mysql(), etc. may store duplicate rows.
    """
    _json_dict = {}
    _pdf = None
    _spdf = None
    _debug = False

    def __init__(self, base_url: str, service_key: str, column_list: list,
                 hdfs_path: str, mysql_conn_args: list, tag=''):
        self._base_url = base_url
        self._service_key = service_key
        self._column = column_list
        self._hdfs_path = hdfs_path
        self._mysql_conn_args = mysql_conn_args
        self.tag = tag

    @abstractmethod
    def _make_query_param(self, **kwargs):
        """
        Make the API query string. Internal method, called by _req_api().
        :param kwargs: {'station': station or region name,
                        'time': measurement time and date}
        :return: query string
        """
        pass

    @abstractmethod
    def _make_payload(self, **kwargs):
        """
        Make the API request payload. Internal method, called by _req_api().
        :param kwargs:
        :return: payload dict
        """
        pass

    def _req_api(self, method: str, query_param: str, payload):
        """
        Send a single request to the API.
        If no response data comes back, the request is retried until a valid
        response is received.
        :param method: HTTP method name, 'get' or 'post'
        :param query_param: return value of self._make_query_param()
        :param payload: return value of self._make_payload()
        :return:
        """
        json_response = None

        # retry until a truthy (non-error) response is received
        while not json_response:
            try:
                Log.d(self.tag, 'req', method, ':', self._base_url + query_param,
                      'payload:', str(payload))
                if method == 'get':
                    json_response = requests.get(self._base_url + query_param)
                elif method == 'post':
                    json_response = requests.post(self._base_url + query_param, data=payload)
            except Exception as e:
                Log.e(self.tag, '_req_api() : occurred Exception!', e.__class__.__name__)
                Log.e(self.tag, 'trying to recall api...')
                continue

        self._json_dict = json.loads(json_response.text)

    @abstractmethod
    def _json2pdf(self, **kwargs):
        """
        Convert the dict-type json that _req_api() stored on this object into a
        pandas dataframe. Every API has its own json layout, so each subclass must
        implement this itself. The resulting pdf is stored on the object; nothing
        is returned except the parse status.
        :param kwargs: input data needed to build the pandas dataframe
        :return: whether the pandas dataframe was parsed successfully
        """
        pass

    def pdf2hdfs(self, mode='append', hdfs_path=''):
        """
        Save the pandas dataframe to HDFS in parquet format.
        :param mode: write mode, 'append' or 'overwrite'
        :param hdfs_path: target HDFS path; if empty, the default path is used
        :return: nothing
        """
        from sparkmodule import PySparkManager

        if hdfs_path == '':
            path = self._hdfs_path
        else:
            path = hdfs_path

        try:
            firstrow = list(self._pdf.iloc[0])
        except Exception as e:
            Log.e(self.tag, 'pdf is empty! : ', e.__class__.__name__)
            return

        Log.d(self.tag, 'pdf(%s) -> %s' % (firstrow, path))

        # make spark dataframe
        self._spdf = PySparkManager().sqlctxt.createDataFrame(self._pdf)

        # append new data
        try:
            self._spdf.write.mode(mode).parquet(path)
        except Exception:
            Log.e(self.tag, 'cannot append row(s).')
            self._spdf.show()
            return

        Log.d(self.tag, 'parquet write completed.')

    def pdf2mysql(self, table_name: str, if_exists: str = 'append'):
        """
        Save the pandas dataframe as a MySQL table. The table must already exist
        (automatic creation of a missing table still needs to be added).
        :param table_name: table name
        :param if_exists: to_sql() parameter, e.g. 'append', 'replace', 'fail'
        :return: nothing
        """
        from dbs.mysqlmodule import MysqlManager

        Log.d(self.tag, 'pdf -> mysql :: ' + str(list(self._pdf.iloc[0])))

        # connect to mysql
        mm = MysqlManager()
        mm.init(self._mysql_conn_args)

        # write to sql
        self._pdf.to_sql(name=table_name, con=mm.engine, if_exists=if_exists, index=False)

        # db close
        mm.close()
        Log.d(self.tag, 'mysql write completed.')

    def pdf2csv(self, out_path: str):
        """
        Write the pandas dataframe to a csv file.
        :return: nothing
        """
        self._pdf.to_csv(out_path, columns=self._column, index=False)

    @abstractmethod
    def log(self, db_type: list, mode='append', **kwargs):
        """
        Log the API's smallest measurement unit.
        :param db_type: list of target databases to log to
        :param mode: default 'append'
        :param kwargs: any external variables that are needed
        :return: nothing
        """
        pass

    def normalize_parquet(self, hdfs_path='', sort_col=None):
        """
        TROUBLE ISSUES
        191022) deprecated: DON'T USE THIS! IT MAY CORRUPT YOUR DATAFRAME!

        Normalize a parquet-format spark dataframe: drop duplicate rows, sort by time, etc.
        If the same day's data was logged twice, the dataframe needs to be normalized once.
        :param hdfs_path:
        :return:
        """
        from sparkmodule import PySparkManager

        if hdfs_path == '':  # default path
            path = self._hdfs_path
        else:  # specific path
            path = hdfs_path

        if not sort_col:
            sort_col = ['station', 'datehour']

        Log.d(self.tag, 'normalizing: read parquet from hdfs... :', path)
        spdf = PySparkManager().sqlctxt.read.parquet(path)

        Log.d(self.tag, 'normalizing: remove duplicated rows and sort by %s...' % sort_col)
        spdf_new = spdf.distinct().sort(sort_col).cache()

        Log.d(self.tag, 'normalizing: write parquet...')
        spdf_new.write.mode('overwrite').parquet(path)

    def get_last_log_datehour(self, db='hdfs'):
        from sparkmodule import PySparkManager

        if db == 'hdfs':
            spdf_total = PySparkManager().sqlctxt.read.parquet(self._hdfs_path)
            last_date = spdf_total.sort(spdf_total.datehour.desc()).first()
            return last_date['datehour']
        else:
            pass

    def get_json_dict(self):
        return self._json_dict

    def get_pdf(self):
        return self._pdf

    def get_spdf(self):
        return self._spdf
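# Minimal sketch of a concrete AbsApi subclass (an assumption for illustration, not a class
# from the original project): the endpoint, service key, json layout, table name, and HDFS
# path are made up; pd, requests, Log, and AbsApi are assumed to be importable from the
# surrounding module. Only the AbsApi interface used here comes from the class above.
class SampleWeatherApi(AbsApi):
    def __init__(self):
        super().__init__(base_url='http://api.example.com/weather',
                         service_key='YOUR_SERVICE_KEY',
                         column_list=['station', 'datehour', 'temperature'],
                         hdfs_path='hdfs:///nl/sample/weather.parquet',
                         mysql_conn_args=[],
                         tag='SampleWeather')

    def _make_query_param(self, **kwargs):
        # hypothetical query layout
        return '?serviceKey=%s&station=%s&time=%s' % (
            self._service_key, kwargs['station'], kwargs['time'])

    def _make_payload(self, **kwargs):
        return None  # this sample endpoint is assumed to need no payload

    def _json2pdf(self, **kwargs):
        # assumes a response like {'items': [{'station': ..., 'datehour': ..., 'temperature': ...}]}
        items = self._json_dict.get('items', [])
        if not items:
            return False
        self._pdf = pd.DataFrame(items, columns=self._column)
        return True

    def log(self, db_type: list, mode='append', **kwargs):
        self._req_api('get', self._make_query_param(**kwargs), self._make_payload())
        if not self._json2pdf():
            Log.e(self.tag, 'nothing to log.')
            return
        if 'hdfs' in db_type:
            self.pdf2hdfs(mode)
        if 'mysql' in db_type:
            self.pdf2mysql('sample_weather', mode)

# e.g. SampleWeatherApi().log(['hdfs'], station='seoul', time='2019-10-22 15:00')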