def drop_hive_table(tab_name, and_folder=True, in_ps=False):
    """Drop a Hive table and, optionally, the HDFS folder behind it."""
    global spark
    if check_hive_table_existence(tab_name):
        if all((in_ps, and_folder)):
            # Table lives in persistent storage: capture its DDL (not used further)
            # and remove the files at its registered location explicitly.
            cr_tb_data = spark.sql(f'show create table {tab_name}').collect()[0][0]
            tab_location = get_table_path(tab_name)
            sh.hadoop('fs', '-rm', '-skipTrash', '-r', tab_location)
        spark.sql(f'drop table if exists {tab_name}')
        if and_folder and not in_ps:
            # Managed table: remove its warehouse folder directly.
            os.system(
                'hdfs dfs -rm -r -skipTrash '
                '/user/hive/warehouse/{}.db/{}'.format(*tab_name.split('.'))
            )
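# A minimal usage sketch for drop_hive_table, assuming an active SparkSession is
# bound to the global name `spark` and that check_hive_table_existence and
# get_table_path are defined elsewhere in this module. The table names below
# are hypothetical.
def example_drop_hive_table():
    global spark
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    # Drop a managed table together with its warehouse folder.
    drop_hive_table('sandbox.tmp_scores', and_folder=True, in_ps=False)
    # Drop a persistent-storage table, removing the files at its registered location.
    drop_hive_table('sandbox.tmp_scores_ps', and_folder=True, in_ps=True)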
def main():
    options, args = parser.parse_args(sys.argv[1:])
    if len(args) < 3:
        parser.print_help()
        sys.exit(45)

    run = args[0]
    is2 = int(args[1])
    ie_or_is2n = int(args[2])

    conf = shapesim.read_config(run)
    simconf = shapesim.read_config(conf['sim'])

    # List every trial output already written to HDFS for this (run, is2, ie) slot.
    pattern = shapesim.get_output_url(run, is2, ie_or_is2n, itrial='*', fs='hdfs')
    flist = awk(hadoop('fs', '-ls', pattern), '{print $8}').split()

    # Print the trials that are still missing.
    nring = simconf['nring']
    for i in xrange(nring):
        f = shapesim.get_output_url(run, is2, ie_or_is2n, itrial=i, fs='hdfs')
        f = f.replace('hdfs://', '')
        if f not in flist:
            print f
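# A sketch of the module-level setup that main() above appears to assume: an
# optparse parser plus the hadoop and awk commands exposed by the sh module.
# The exact imports and the usage string are assumptions, not taken from the source.
import sys
from optparse import OptionParser
from sh import awk, hadoop   # assumed: sh-wrapped CLI commands used in main()
import shapesim              # simulation config / output-path helpers used above

parser = OptionParser(usage='usage: %prog run is2 ie_or_is2n')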
def ls(hdfsPath):
    try:
        return (0, sh.hadoop("fs", "-ls", hdfsPath))
    except sh.ErrorReturnCode as e:
        return (-1, e.stderr)
def size(hdfsPath):
    try:
        return (0, sh.hadoop("fs", "-du", "-h", hdfsPath))
    except sh.ErrorReturnCode as e:
        print e
        return (-1, e.stderr)
def setrep(repFactor, dirName):
    try:
        return (0, sh.hadoop("fs", "-setrep", "-R", repFactor, dirName))
    except sh.ErrorReturnCode as e:
        print e
        return (-1, e.stderr)
def runTable(jarFile, scale, base, tableName):
    try:
        return (0, sh.hadoop("jar", jarFile,
                             "-d", base + "/" + str(scale) + "/",
                             "-s", scale,
                             "-t", tableName))
    except sh.ErrorReturnCode as e:
        print e
        return (-1, e.stderr)
def mkdir(hdfsPath):
    try:
        return (0, sh.hadoop("fs", "-mkdir", "-p", hdfsPath))
    except sh.ErrorReturnCode as e:
        return (-1, e.stderr)
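# A minimal sketch of how the sh-based helpers above might be composed: create a
# target directory, run a data-generator jar for one table, lower the replication
# factor, and inspect the result. The jar name, HDFS base path, scale factor and
# table name are hypothetical; every helper returns (0, output) on success and
# (-1, stderr) on failure.
def example_generate_table():
    base = '/benchmarks/datagen'      # hypothetical HDFS base directory
    scale = 10                        # hypothetical scale factor
    rc, out = mkdir(base + '/' + str(scale))
    if rc != 0:
        print out
        return
    runTable('datagen.jar', scale, base, 'store_sales')
    setrep(2, base)
    print ls(base)[1]
    print size(base)[1]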
def save_sdf_to_ps(sdf: pyspark.sql.dataframe.DataFrame or bool = False,
                   table_name: str = 'new_tab',
                   cur_path: str or bool = False,
                   overwrite: bool = True,
                   hive_schema: str = 'default',
                   ps_folder: str = '',
                   parquet_write_mode: str = 'overwrite',
                   parquet_compression: str = 'none',
                   ps_path: str = 'hdfs://clsklsbx/user/team/team_ds_cltv/'):
    """Save a Spark DataFrame as parquet in "Persistent Storage" and register it
    as an external Hive table.

    sdf - Spark DataFrame to save
    table_name - new table name in Hive
    overwrite - overwrite the Hive table if it already exists
    hive_schema - name of the Hive db
    ps_folder - directory in "Persistent Storage" to save into
    ps_path - hdfs link to our "Persistent Storage"
    cur_path - if the parquet files already exist, only create the external table
    """
    tab_name = f'{hive_schema}.{table_name}'
    existence = check_hive_table_existence(tab_name)
    ps_folder = hive_schema if len(ps_folder) == 0 else ps_folder
    final_path = f'{ps_path}{ps_folder}'
    table_path = f'{final_path}/{table_name}'

    if any([not existence, overwrite]):
        if existence:
            if not cur_path:
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            else:
                # NOTE: `new_path` is undefined in the original code; the intended
                # distcp destination must be supplied before this branch is used.
                sh.hadoop('distcp', cur_path, new_path)
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            drop_hive_table(tab_name, False)
    else:
        print(f'{tab_name} already exists')
        return None

    if cur_path:
        # Files already exist: register them in place instead of rewriting them.
        sdf = spark.read.parquet(cur_path)
        table_path = cur_path

    # Cast date columns to timestamp before exposing the data to Hive.
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))

    if not cur_path:
        if len(ps_folder) > 0:
            # Create the target folder in persistent storage if it is missing.
            hadoop_folders = list(
                filter(lambda x: len(x) > 1,
                       sh.hadoop('fs', '-ls', '-C', ps_path).split('\n')))
            hadoop_folders = [x.split('/')[-1] for x in hadoop_folders]
            if not any([x == ps_folder for x in hadoop_folders]):
                sh.hadoop('fs', '-mkdir', final_path)
                sh.hdfs('dfs', '-chmod', '-R', '777', final_path)
        (sdf.write.option('compression', parquet_compression)
            .mode(parquet_write_mode).parquet(table_path))
        sh.hdfs('dfs', '-setrep', '-R', '2', table_path)

    send_beeline_query(
        query=f"create external table {tab_name} "
              f"({','.join([f'{x[0]} {x[1]}' for x in sdf.dtypes])}) "
              f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' "
              f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' "
              f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' "
              f"location '{table_path}' ",
        print_output=False
    )
    sh.hdfs('dfs', '-chmod', '-R', '777', table_path)
    print(f'{tab_name} created, files based in {table_path}')
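# A minimal usage sketch for save_sdf_to_ps, assuming an active SparkSession bound
# to `spark` plus the send_beeline_query and check_hive_table_existence helpers
# from this module. The source table, target table, and folder names are hypothetical.
def example_save_sdf_to_ps():
    sdf = spark.table('default.some_source_table') \
               .select('client_id', 'score', 'score_date')
    # Write the DataFrame to persistent storage and register it as the external
    # Hive table default.client_scores.
    save_sdf_to_ps(sdf=sdf,
                   table_name='client_scores',
                   hive_schema='default',
                   ps_folder='cltv_scores',
                   overwrite=True)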