def go_transform(self, parquet_write_mode: str = 'append'):
    # Read the already-encoded part (if any) and the raw table that still needs encoding.
    if self.ex_tab_length > 0:
        df_enc = self.spark.read.parquet(self.tab_for_predict_enc)
    else:
        df_enc = False
    df_to_enc = self.spark.read.parquet(self.tab_for_predict)
    if not self.dm_length:
        self.dm_length = df_to_enc.count()
    self.repart_val = int(self.dm_length / self.batch_size)
    if self.ex_tab_length == 0:
        cur_length = self.dm_length
    else:
        # Keep only the rows that have not been encoded yet.
        df_to_enc = df_to_enc.repartition(self.repart_val, F.col(self.numeric_id_name)).join(
            df_enc.repartition(self.repart_val, F.col(self.numeric_id_name)),
            on=[self.numeric_id_name],
            how='left_anti'
        )
        cur_length = df_to_enc.count()
    # Split the remaining rows into chunks that respect max_batch_size.
    n_parts = int(np.ceil(cur_length / self.max_batch_size))
    if n_parts > 1:
        parts = [np.round(1 / n_parts, 2), ] * n_parts + [np.round(1 / n_parts, 2)]
        df_parts = df_to_enc.randomSplit(parts)
    else:
        df_parts = [df_to_enc, ]
    self.print_func(f'dm n_parts {len(df_parts)}')
    self.print_func('transforming...')
    for df_part in df_parts:
        self.transform_spark_df(
            sdf=df_part,
            path_to_write=self.tab_for_predict_enc,
            parquet_write_mode=parquet_write_mode,
            repartition_val=int(cur_length / self.batch_size / n_parts),
        )
        self.spark.catalog.clearCache()
        parquet_write_mode = 'append'  # in case the first write mode was 'overwrite'
    sh.hdfs('dfs', '-chmod', '-R', '777', self.tab_for_predict_enc)
    sh.hdfs('dfs', '-setrep', '-R', '2', self.tab_for_predict_enc)
    if self.hive_database_for_dms:
        # Recreate the external Hive table on top of the encoded parquet files.
        self.hive_query_func(
            query=f"drop table if exists {self.hive_database_for_dms}.{self.tab_for_predict_enc.split('/')[-1]}; "
                  f"create external table {self.hive_database_for_dms}.{self.tab_for_predict_enc.split('/')[-1]} "
                  f"({','.join([f'{x[0]} {x[1]}' for x in self.spark.read.parquet(self.tab_for_predict_enc).dtypes])}) "
                  f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' "
                  f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' "
                  f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' "
                  f"location '{self.tab_for_predict_enc}' "
        )
def repair_hdfs_table(ps_path, schema, tabname):
    # Find parquet files that `hdfs fsck` reports as CORRUPT.
    bash_ls = f'hdfs fsck {ps_path}/{schema}/{tabname} -locations -blocks -files | grep "user.*parquet: CORRUPT"'
    p = subprocess.Popen(bash_ls, stdout=subprocess.PIPE, shell=True)
    partitions, _ = p.communicate()
    partitions = [s.decode('utf-8') for s in partitions.split(b'\n')[:-1]]
    partitions = [s.split(' ')[0].split('/')[-1][:-1] for s in partitions]
    # Drop each corrupt file, then restore a replication factor of 2 for the rest.
    for prt in partitions:
        sh.hdfs('dfs', '-rm', '-skipTrash', f'{ps_path}/{schema}/{tabname}/{prt}')
    sh.hdfs('dfs', '-setrep', '2', f'{ps_path}/{schema}/{tabname}')
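# A minimal usage sketch for repair_hdfs_table. The path components below are
# hypothetical placeholders, not values taken from this codebase; the caller is
# assumed to have permission to delete the corrupt parquet parts.
repair_hdfs_table(
    ps_path='hdfs://clsklsbx/user/team/team_ds_cltv',  # hypothetical storage root
    schema='default',                                  # hypothetical schema folder
    tabname='cltv_scores_demo',                        # hypothetical table folder
)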
def score_npv(self, bp):
    score_tab_name = f'{self.hdfs_path}/{self.scores_folder}/cltv_get_{self.score_date}_npv_{bp}'
    if self.skip_ex_scores and self.ch_parq_table_func(score_tab_name):
        self.print_func(f'score {bp} already exists: {score_tab_name}')
        return
    if bp != 'dc':
        encoder_path = os.path.join(self.models_npv_path, bp, f'encoder_{bp}.pkl')
        encoder_path = encoder_path if os.path.exists(encoder_path) else False
        self.get_score_func_npv(
            model_file=os.path.join(self.models_npv_path, bp, f'{bp}.pkl'),
            model_encoder_file=encoder_path)
        df_to_save = eval(
            f"self.df_raw.select(self.numeric_id_name, score_npv_{bp}(*self.used_features).alias('npv_{bp}'))"
        )
    else:
        # 'dc' uses two models: dc1 when there is no active card, dc2 otherwise.
        encoder_path_dc1 = os.path.join(self.models_npv_path, bp, 'dc1', 'encoder_dc1.pkl')
        encoder_path_dc2 = os.path.join(self.models_npv_path, bp, 'dc2', 'encoder_dc2.pkl')
        encoder_path_dc1 = encoder_path_dc1 if os.path.exists(encoder_path_dc1) else False
        encoder_path_dc2 = encoder_path_dc2 if os.path.exists(encoder_path_dc2) else False
        self.get_score_func_npv(
            model_file=os.path.join(self.models_npv_path, bp, 'dc1', 'dc1.pkl'),
            model_encoder_file=encoder_path_dc1)
        used_features_dc1 = self.used_features
        self.get_score_func_npv(
            model_file=os.path.join(self.models_npv_path, bp, 'dc2', 'dc2.pkl'),
            model_encoder_file=encoder_path_dc2)
        used_features_dc2 = self.used_features
        df_to_save = eval(
            f'''self.df.withColumn('nmb_group', F.lit(1)).select(
                "{self.numeric_id_name}",
                F.when(F.col("prd_crd_dc_active_qty") > 0, score_npv_dc2(*used_features_dc2))\
                    .otherwise(score_npv_dc1(*used_features_dc1))\
                    .alias('npv_{bp}')
            )''')
    df_to_save = df_to_save.repartition(
        int(np.ceil(self.dm_length / self.bucket_size / 3)))
    df_to_save.write.option(
        'compression', 'none').mode('overwrite').parquet(score_tab_name)
    self.print_func(f'score {bp} recorded in {score_tab_name}')
    sh.hdfs('dfs', '-setrep', '-R', '2', score_tab_name)
    sh.hdfs('dfs', '-chmod', '-R', '777', score_tab_name)
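# A minimal usage sketch for score_npv. `scorer` is a hypothetical name for an
# instance of the enclosing scoring class (its real name is not shown in this
# snippet); needed_bps_npv is the list of NPV products set in __init__.
for bp in scorer.needed_bps_npv:
    scorer.score_npv(bp)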
def get_repartition_value(sdf: pyspark.sql.dataframe.DataFrame,
                          target_size: int = 245,
                          compression: str = 'none') -> int:
    """Estimate how many partitions are needed for parquet files of roughly `target_size` MB."""
    length = sdf.count()
    df_sample = sdf.limit(int(1e4))
    # Write a 10k-row sample to a temporary HDFS path to measure bytes per row.
    tmp_file_name = 'test_file'
    while check_hdfs_file_ex(tmp_file_name):
        tmp_file_name += '_'
    df_sample.coalesce(1).write.option('compression', compression)\
        .mode('overwrite').parquet(tmp_file_name)
    row_byte_weight = int(
        sh.hdfs('dfs', '-du', tmp_file_name)
        .stdout.decode('utf-8').split('\n')[-2].split(' ')[0])
    sh.hdfs('dfs', '-rm', '-R', '-skipTrash', tmp_file_name)
    nd_rep_val = int(row_byte_weight * length / target_size / (1024 * 1024) / 1e4)
    return 1 if nd_rep_val < 1 else nd_rep_val
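# A minimal usage sketch for get_repartition_value. `spark` and the parquet
# paths are assumptions for illustration only. The returned value is meant to
# be passed to DataFrame.repartition before writing, so the output files land
# near the target size.
sdf = spark.read.parquet('hdfs:///tmp/some_table')        # hypothetical source
n_parts = get_repartition_value(sdf, target_size=245)
sdf.repartition(n_parts).write.mode('overwrite') \
    .parquet('hdfs:///tmp/some_table_rewritten')          # hypothetical target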
def _list_hdfs_content(path):
    """
    [Not currently used] Utility to get a list of the items within an HDFS path

    :param path: A valid HDFS folder path
    :return: List of the contained items as string paths
    """
    return [
        line.rsplit(None, 1)[-1]
        for line in sh.hdfs('dfs', '-ls', path).split('\n')
        if len(line.rsplit(None, 1))
    ][1:]
def getFilelist(path_hdfs):
    files = []
    lines = sh.hdfs('dfs', '-ls', path_hdfs).split('\n')
    # Skip the "Found N items" header and the trailing empty line.
    for c in lines[1:len(lines) - 1]:
        files.append(c.rsplit(None, 1)[-1])
    print("Files found:")
    for f in files:
        print(f)
    return files
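# A minimal usage sketch for getFilelist. `sc` is assumed to be an existing
# SparkContext and the HDFS directory is a placeholder; the returned paths can
# be fed straight to sc.textFile, as in the notebook cells further below.
files = getFilelist('/user/ubuntu/hdfs/dataset')   # placeholder directory
first_rdd = sc.textFile(files[0])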
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    # Build a unique temporary name (pid + hostname) that exists neither locally nor on HDFS.
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', name)
    # Cast date columns to timestamp before writing parquet.
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    # Write to HDFS, pull the files to the local working directory, then read with pandas.
    sdf.write.mode('overwrite').parquet(name)
    sh.hdfs('dfs', '-get', name, os.getcwd())
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', name)
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
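# A minimal usage sketch for get_parquets_from_sdf. `spark` and the source path
# are assumptions; the input should be small enough to fit in local memory,
# since the helper returns it as a pandas DataFrame.
sdf = spark.read.parquet('hdfs:///tmp/small_feature_table')   # hypothetical source
pdf = get_parquets_from_sdf(sdf)
print(pdf.shape)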
import findspark

# Note: you need to change the path to the Spark installation, to HDFS and also the master's URL.
findspark.init("/ <path_to_spark> /spark")

import sh
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.clustering import KMeans
from numpy import array
from math import sqrt

conf = SparkConf().setAppName("SO_project").setMaster(
    "spark:// <master_name> :7077")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

hdfsdir = '/ <path_to_hdfs> /hdfs/dataset'
files = [
    line.rsplit(None, 1)[-1]
    for line in sh.hdfs('dfs', '-ls', hdfsdir).split('\n')
    if len(line.rsplit(None, 1))
][2:]


def apply_preprocessing(rdd):
    '''
    This function applies some transformations in order to have a dataset
    with this shape: ((plate, gate))

    - **parameters**, **types**, **return** and **return types**::

        :param rdd: RDD to transform
        :type rdd: pyspark.rdd.RDD
        :return: return the transformed RDD
        :rtype: pyspark.rdd.RDD
    '''
    header = rdd.first()
    rdd = rdd.filter(lambda lines: lines != header)
def save_sdf_to_ps(sdf: pyspark.sql.dataframe.DataFrame or bool = False,
                   table_name: str = 'new_tab',
                   cur_path: str or bool = False,
                   overwrite: bool = True,
                   hive_schema: str = 'default',
                   ps_folder: str = '',
                   parquet_write_mode: str = 'overwrite',
                   parquet_compression: str = 'none',
                   ps_path: str = 'hdfs://clsklsbx/user/team/team_ds_cltv/'):
    """sdf - Spark DataFrame to save
    table_name - new table name in Hive
    overwrite - overwrite the Hive table if it already exists
    hive_schema - name of the Hive db
    ps_folder - directory in "Persistent Storage" to save to
    ps_path - hdfs link to our "Persistent Storage"
    cur_path - if the parquet files already exist, only create the external table on top of them
    """
    tab_name = f'{hive_schema}.{table_name}'
    existence = check_hive_table_existence(tab_name)
    ps_folder = hive_schema if len(ps_folder) == 0 else ps_folder
    final_path = f'{ps_path}{ps_folder}'
    table_path = f'{final_path}/{table_name}'
    if any([not existence, overwrite]):
        if existence:
            if not cur_path:
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            else:
                sh.hadoop('distcp', cur_path, new_path)
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            drop_hive_table(tab_name, False)
    else:
        print(f'{tab_name} already exists')
        return None
    if cur_path:
        sdf = spark.read.parquet(cur_path)
        table_path = cur_path
    # Cast date columns to timestamp (affects both the written files and the Hive column types).
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    if not cur_path:
        if len(ps_folder) > 0:
            # Create the target folder if it does not exist yet.
            hadoop_folders = list(
                filter(lambda x: len(x) > 1,
                       sh.hadoop('fs', '-ls', '-C', ps_path).split('\n')))
            hadoop_folders = [x.split('/')[-1] for x in hadoop_folders]
            if not any([x == ps_folder for x in hadoop_folders]):
                sh.hadoop('fs', '-mkdir', final_path)
                sh.hdfs('dfs', '-chmod', '-R', '777', final_path)
        sdf.write.option('compression', parquet_compression) \
            .mode(parquet_write_mode).parquet(table_path)
        sh.hdfs('dfs', '-setrep', '-R', '2', table_path)
    send_beeline_query(
        query=f"create external table {tab_name} "
              f"({','.join([f'{x[0]} {x[1]}' for x in sdf.dtypes])}) "
              f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' "
              f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' "
              f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' "
              f"location '{table_path}' ",
        print_output=False
    )
    sh.hdfs('dfs', '-chmod', '-R', '777', table_path)
    print(f'{tab_name} created, files based in {table_path}')
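# A minimal usage sketch for save_sdf_to_ps. The DataFrame, schema, folder and
# table names below are placeholders; the helpers it relies on (spark,
# check_hive_table_existence, drop_hive_table, send_beeline_query) are assumed
# to be available in the calling module. With overwrite=True the existing Hive
# table is dropped, the parquet files are rewritten under ps_path/ps_folder,
# and an external table is recreated on top of them.
save_sdf_to_ps(
    sdf=scored_df,                  # hypothetical Spark DataFrame
    table_name='cltv_scores_demo',  # placeholder table name
    hive_schema='sandbox',          # placeholder Hive database
    ps_folder='cltv_demo',          # placeholder storage folder
    overwrite=True,
)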
def __init__(self,
             spark: pyspark.sql.session.SparkSession,
             hdfs_path: str,
             scores_folder: str,
             score_date: str,
             models_path: str,
             models_npv_path: str,
             numeric_id_name: str,
             skip_ex_scores: bool,
             tab_for_predict_enc: str,
             tab_for_predict_raw: str,
             periods: str,
             print_func: Any,
             ch_file_ex_func: Any,
             ch_parq_table_func: Any,
             needed_bps_npv: list,
             bucket_size: int,
             periods_file='periods.xlsx',
             needed_couples=False):
    self.spark = spark
    self.sc = self.spark.sparkContext
    self.hdfs_path = hdfs_path
    self.scores_folder = scores_folder
    self.score_date = score_date
    self.models_path = models_path
    self.models_npv_path = models_npv_path
    self.numeric_id_name = numeric_id_name
    self.skip_ex_scores = skip_ex_scores
    self.tab_for_predict_enc = tab_for_predict_enc
    self.periods = periods
    self.print_func = print_func
    self.problem_models, self.scored_models = [], []
    self.df = self.spark.read.parquet(tab_for_predict_enc)
    self.df_raw = self.spark.read.parquet(tab_for_predict_raw)
    self.dm_length = self.df.count()
    self.periods_file = periods_file
    self.used_features = []
    self.needed_bps_npv = needed_bps_npv
    self.bucket_size = bucket_size
    self.ch_parq_table_func = ch_parq_table_func
    self.ch_file_ex_func = ch_file_ex_func
    self.needed_couples = needed_couples
    self.sc.setLogLevel('FATAL')

    if not self.periods:
        self.periods = pd.read_excel(periods_file)
        self.periods['model'] = self.periods.apply(
            lambda x: f"{x['bp']}_{x['sp']}", axis=1)
        self.periods = self.periods[['model', 'T_min', 'T_max']] \
            .set_index('model').T.to_dict()
        self.periods_setted = None
    else:
        self.periods_setted = {'T_min': periods[0], 'T_max': periods[-1]}

    if not needed_couples:
        try:
            config_file = f'{self.hdfs_path}/score_config/cltv_resp_product_pairs_to_score.csv'
            df = self.spark.read.option('delimiter', ';').csv(
                config_file, header=True).toPandas().fillna(-1).set_index(
                    'BP').astype('int')
            self.needed_couples = []
            for col in df.columns[1:]:
                self.needed_couples.extend([
                    (bp, col, priority) for bp, priority in df[
                        df[col] > 0][['priority']].reset_index().values
                    if priority > 0
                ])
            self.needed_couples = pd.DataFrame(self.needed_couples, columns=['bp', 'sp', 'p'])\
                .sort_values(['p', 'bp', 'sp'])[['bp', 'sp']].values.tolist()
            self.needed_couples = list(map(tuple, self.needed_couples))
        except Exception as E:
            self.print_func(
                f'problem with reading config {config_file}: {E}')
            sys.exit()
    self.needed_bps_npv.sort()

    # Source template for the pandas UDF that scores one (bp, sp) response couple.
    self.score_f = '''@F.pandas_udf(T.ArrayType(T.DoubleType()))
def uplifts_{bp}_{sp}(*cols):
    if not os.path.exists('dspl'):
        import zipfile
        with zipfile.ZipFile(f'dspl.zip', 'r') as z:
            z.extractall('dspl')
    with open('{model_file_f}', 'rb') as f:
        model = pickle.load(f)
    df = pd.concat(cols, axis=1).astype('float32')
    del cols
    gc.collect()
    l = df.shape[0]
    df.columns = {used_features}
    if str(model).startswith('CBClassifier'):
        cut_ct_ft = list(set(model.categorical_features) & set(df.columns))
        if len(cut_ct_ft) > 0:
            df[cut_ct_ft] = df[cut_ct_ft].astype('int64')
    output = pd.DataFrame()
    nans = np.empty(l)
    nans[:] = np.nan
    for t in range(0, 13):
        if ('{bp}' == '{sp}' or ('{bp}' == "kp" and '{sp}' == "pl")) and t == 0:
            output[str(t)] = nans
        elif t in range({start_t}, {end_t}):
            df['T'] = t
            upl = []
            for prchs in range(2):
                df['{bp}'] = prchs
                upl.append(np.round(model.transform(df).astype('float32'), 5))
            output[str(t)] = upl[1] - upl[0]
        else:
            output[str(t)] = nans
    return pd.Series(output[[str(x) for x in sorted([int(x) for x in output.columns.tolist()])]].values.tolist())'''

    # Source template for the pandas UDF that scores one NPV model.
    self.score_npv_f = '''@F.pandas_udf(T.DoubleType())
def score_npv_{model_name}(*cols):
    if not os.path.exists('dspl'):
        import zipfile
        with zipfile.ZipFile(f'dspl.zip', 'r') as z:
            z.extractall('dspl')
    with open('{model_file_f}', 'rb') as f:
        model = pickle.load(f)
    df = pd.concat(cols, axis=1)
    del cols
    gc.collect()
    df.columns = {used_features}
    if '{encoder_file}' != 'False':
        with open('{encoder_file}', 'rb') as f:
            encoder = pickle.load(f)
        df = encoder.transform(df)
        gc.collect()
    return pd.Series(model.transform(df.astype('float32'))).round(2)'''

    if not self.ch_file_ex_func(f'{hdfs_path}{self.scores_folder}'):
        sh.hdfs('dfs', '-mkdir', f'{hdfs_path}{self.scores_folder}')
        sh.hdfs('dfs', '-chmod', '-R', '777', f'{hdfs_path}{self.scores_folder}')
    self.dm_length = self.df.count()
    self.print_func(f'dm_length: {self.dm_length}')
def score_couple(self, bp, sp):
    if self.skip_ex_scores and f'{bp}_{sp}' in self.scored_models:
        return
    self.print_func(f'Starting to score {bp}_{sp}')
    hadoop_path = f'{self.hdfs_path}{self.scores_folder}'
    score_tab_name = f'{hadoop_path}/uplifts_{bp}_{sp}_{self.score_date}'
    aggr_scores_name = f'{hadoop_path}/uplifts_{bp}_{self.score_date}'
    aggr_scores_step_names = [
        f'{hadoop_path}/step_uplifts_{bp}_{self.score_date}_{i}' for i in range(6)
    ]
    if self.skip_ex_scores and self.ch_parq_table_func(score_tab_name):
        self.print_func(
            f'score {bp}_{sp} already exists: {score_tab_name} or added in {aggr_scores_name}'
        )
        self.spark.catalog.clearCache()
        return
    else:
        # Check whether the couple is already present in the aggregated or step tables.
        if self.ch_parq_table_func(aggr_scores_name):
            self.scored_models.extend([
                x for x in self.spark.read.parquet(aggr_scores_name).columns[1:-1]
                if x not in self.scored_models
            ])
            if f'{bp}_{sp}' in self.scored_models:
                self.print_func(
                    f'scores {",".join([x for x in self.scored_models if x.startswith(bp+"_")])} '
                    f'already added in {aggr_scores_name}'
                )
                self.spark.catalog.clearCache()
                return
        rdy_couples = []
        for t in aggr_scores_step_names:
            if self.ch_parq_table_func(t):
                rdy_couples.extend([
                    x for x in self.spark.read.parquet(t).columns[1:-1]
                    if x not in self.scored_models
                ])
                self.scored_models.extend(rdy_couples)
            else:
                break
        if len(rdy_couples) != 0:
            if f'{bp}_{sp}' in rdy_couples:
                self.print_func(f'score {bp}_{sp} already added in {t}')
                self.spark.catalog.clearCache()
                return
    if bp != 'dc1':
        model_file = f'{self.models_path}/{bp}_{sp}.pkl'  # SL
        model_file_f = f'{bp}_{sp}.pkl'
        score_f = self.get_score_func(model_file)
        new_col_name = f'{bp}_{sp}'
        df_to_save = eval(f'''self.df.select(
            "{self.numeric_id_name}",
            uplifts_{bp}_{sp}(*[F.col(x) for x in self.used_features]).alias("{bp}_{sp}")
        ).repartition(int(np.ceil(self.dm_length/self.bucket_size)))''')
    else:
        model_file_f = f'{bp}_{sp}.pkl'
        uplifts_dc2p = self.get_score_func(
            f'{self.models_path}/dc2p_{sp}.pkl')
        used_features_dc2p = [F.col(x) for x in self.used_features]
        uplifts_dc1 = self.get_score_func(
            f'{self.models_path}/dc1_{sp}.pkl')
        used_features_dc1 = [F.col(x) for x in self.used_features]
        df_to_save = eval(f'''self.df.select(
            "{self.numeric_id_name}",
            F.when(F.col("prd_crd_dc_active_qty") > 0, uplifts_dc2p_{sp}(*used_features_dc2p))\
                .otherwise(uplifts_dc1_{sp}(*used_features_dc1))\
                .alias("{bp}_{sp}")
        ).repartition(int(np.ceil(self.dm_length/self.bucket_size)))''')
    self.print_func(f'scoring {bp}_{sp}')
    df_to_save = df_to_save.repartition(
        int(np.ceil(self.dm_length / self.bucket_size / 2.5)))
    df_to_save.write.option(
        'compression', 'none').mode('overwrite').parquet(score_tab_name)
    sh.hdfs('dfs', '-setrep', '-R', '2', score_tab_name)
    sh.hdfs('dfs', '-chmod', '-R', '777', score_tab_name)
    self.print_func(f'scores {bp}_{sp} recorded to {score_tab_name}')
    gc.collect()
    self.spark.catalog.clearCache()
    self.scored_models.append(f'{bp}_{sp}')
    self.print_func(
        f'{round((self.needed_couples.index((bp, sp)) + 1) / len(self.needed_couples) * 100, 2)}'
        f'% of response models done'
    )
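# A minimal usage sketch for score_couple. `scorer` again stands for a
# hypothetical instance of the enclosing class; needed_couples is already
# sorted by priority in __init__, so iterating over it scores the response
# models in that order.
for bp, sp in scorer.needed_couples:
    scorer.score_couple(bp, sp)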
def build_mdm_slice(self):
    if self.score_on_ready_data_mart:
        if not self.ch_parq_table_func(self.tab_for_predict):
            out = f'ERROR: specified dm {self.tab_for_predict} does not exist'
            self.print_func(out)
            raise Exception(out)
    elif not self.ch_parq_table_func(self.tab_for_predict):
        self.print_func(f'building dm {self.tab_for_predict}')
        if self.dm_filter_cond:
            self.dm_filter_cond += ' and '
        else:
            self.dm_filter_cond = ''
        df_dm = self.spark.read.parquet(f'{self.dm_path}').filter(
            f'{self.dm_periods_column_name} in ({self.get_date_partition(max(self.periods_from_dm))})'
        ).select([self.numeric_id_name, *list(set(self.cols_to_avg) | set(self.f_cols))])
        df_1 = df_dm.filter(
            f'{self.dm_filter_cond} {self.dm_periods_column_name} = \'{self.slice_date}\' ')
        self.dm_length = df_1.count()
        self.print_func(f'slice length: {self.dm_length}')
        if len(self.cols_to_avg) > 0:
            arrg_features = [
                df_dm.filter(
                    f'{self.dm_filter_cond} {self.dm_periods_column_name} in '
                    f'({self.get_date_partition(period)})'
                ).select([self.numeric_id_name, *self.cols_to_avg])
                .groupby(self.numeric_id_name)
                .agg(*[
                    F.avg(col).alias(f'{col}_{period}a') for col in self.cols_to_avg
                ])
                for period in self.periods_from_dm
            ]
        else:
            arrg_features = []
        self.repart_val = int(self.dm_length / self.batch_size)
        repart_dfs = [
            x.repartition(self.repart_val, F.col(self.numeric_id_name))
            for x in (df_1, *arrg_features)
        ]
        df = repart_dfs.pop(0)
        for another_df in repart_dfs:
            df = df.join(another_df, on=self.numeric_id_name)
        df.repartition(self.repart_val) \
            .write.mode('overwrite') \
            .option('compression', 'none') \
            .parquet(self.tab_for_predict)
        self.spark.catalog.clearCache()
        sh.hdfs('dfs', '-chmod', '-R', '777', self.tab_for_predict)
        sh.hdfs('dfs', '-setrep', '-R', '2', self.tab_for_predict)
    else:
        self.print_func(f'dm {self.dm_name} already exists')
    if self.hive_database_for_dms:
        self.hive_query_func(
            query=f"create database if not exists {self.hive_database_for_dms}; "
                  f"drop table if exists {self.hive_database_for_dms}.{self.dm_name}; "
                  f"create external table {self.hive_database_for_dms}.{self.dm_name} "
                  f"({','.join([f'{x[0]} {x[1]}' for x in self.spark.read.parquet(self.tab_for_predict).dtypes])}) "
                  f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' "
                  f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' "
                  f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' "
                  f"location '{self.tab_for_predict}' "
        )
        self.print_func(f'table {self.hive_database_for_dms}.{self.dm_name} created in hive')
import findspark
findspark.init("/opt/spark")

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark import sql

conf = SparkConf().setAppName("SO_project").setMaster("spark://damiani-master-slave-0:7077")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

# In[7]:

import sh

hdfsdir = '/user/ubuntu/hdfs/dataset'
files = [
    line.rsplit(None, 1)[-1]
    for line in sh.hdfs('dfs', '-ls', hdfsdir).split('\n')
    if len(line.rsplit(None, 1))
][2:]

# In[8]:

rdd = sc.textFile(files[0])

# In[9]:

rdd.take(5)

# In[10]: