Example #1
    def go_transform(self, parquet_write_mode: str = 'append'):

        if self.ex_tab_length > 0:
            df_enc = self.spark.read.parquet(self.tab_for_predict_enc)
        else:
            df_enc = False

        df_to_enc = self.spark.read.parquet(self.tab_for_predict)

        if not self.dm_length:
            self.dm_length = df_to_enc.count()
            self.repart_val = int(self.dm_length / self.batch_size)

        if self.ex_tab_length == 0:
            cur_length = self.dm_length
        else:
            df_to_enc = df_to_enc.repartition(self.repart_val, F.col(self.numeric_id_name)).join(
                df_enc.repartition(self.repart_val, F.col(self.numeric_id_name)),
                on=[self.numeric_id_name],
                how='left_anti'
            )
            cur_length = df_to_enc.count()

        n_parts = int(np.ceil(cur_length / self.max_batch_size))
        if n_parts > 1:
            parts = [np.round(1 / n_parts, 2), ] * n_parts + [np.round(1 / n_parts, 2)]
            df_parts = df_to_enc.randomSplit(parts)
        else:
            df_parts = [df_to_enc, ]

        self.print_func(f'dm n_parts {len(df_parts)}')
        self.print_func('transforming...')

        for df_part in df_parts:
            self.transform_spark_df(
                sdf=df_part,
                path_to_write=self.tab_for_predict_enc,
                parquet_write_mode=parquet_write_mode,
                repartition_val=int(cur_length / self.batch_size / n_parts),
            )
            self.spark.catalog.clearCache()
            parquet_write_mode = 'append'  # if the first write was 'overwrite'

        sh.hdfs('dfs', '-chmod', '-R', '777', self.tab_for_predict_enc)
        sh.hdfs('dfs', '-setrep', '-R', '2', self.tab_for_predict_enc)

        if self.hive_database_for_dms:
            self.hive_query_func(
                query=f"drop table if exists {self.hive_database_for_dms}.{self.tab_for_predict_enc.split('/')[-1]}; "
                      f"create external table {self.hive_database_for_dms}.{self.tab_for_predict_enc.split('/')[-1]} " \
                      f"({','.join([f'{x[0]} {x[1]}' for x in self.spark.read.parquet(self.tab_for_predict_enc).dtypes])}) " \
                      f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " \
                      f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' " \
                      f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' " \
                      f"location '{self.tab_for_predict_enc}' "
            )
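
A minimal usage sketch for the method above. The enclosing class, its constructor and the `transform_spark_df` helper are not shown in this example, so `encoder` below is a hypothetical, already-constructed instance:

# Hedged sketch: the first call overwrites the encoded table, repeat calls append to it.
encoder.go_transform(parquet_write_mode='overwrite')
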
def repair_hdfs_table(ps_path, schema, tabname):
    """Delete parquet files reported as CORRUPT by `hdfs fsck` and reset the replication factor."""
    bash_ls = f'hdfs fsck {ps_path}/{schema}/{tabname} -locations -blocks -files | grep "user.*parquet: CORRUPT"'
    p = subprocess.Popen(bash_ls, stdout=subprocess.PIPE, shell=True)
    partitions, _ = p.communicate()
    partitions = [s.decode('utf-8') for s in partitions.split(b'\n')[:-1]]
    partitions = [s.split(' ')[0].split('/')[-1][:-1] for s in partitions]
    for prt in partitions:
        sh.hdfs('dfs', '-rm', '-skipTrash',
                f'{ps_path}/{schema}/{tabname}/{prt}')
    sh.hdfs('dfs', '-setrep', '2', f'{ps_path}/{schema}/{tabname}')
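
A hedged invocation sketch for the helper above; the path components are placeholders rather than values from this codebase, and `sh`/`subprocess` are assumed to be imported at module level:

# Removes every parquet file reported as CORRUPT under the table directory,
# then resets the replication factor to 2.
repair_hdfs_table(
    ps_path='hdfs://clsklsbx/user/team',  # placeholder HDFS prefix
    schema='team_ds_cltv',                # placeholder folder name
    tabname='some_scores_table',          # placeholder table directory
)
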
Example #3
    def score_npv(self, bp):
        score_tab_name = f'{self.hdfs_path}/{self.scores_folder}/cltv_get_{self.score_date}_npv_{bp}'

        if self.skip_ex_scores and self.ch_parq_table_func(score_tab_name):
            self.print_func(f'score {bp} already exists: {score_tab_name}')
            return

        if bp != 'dc':
            encoder_path = os.path.join(self.models_npv_path, bp,
                                        f'encoder_{bp}.pkl')
            encoder_path = encoder_path if os.path.exists(
                encoder_path) else False
            self.get_score_func_npv(model_file=os.path.join(
                self.models_npv_path, bp, f'{bp}.pkl'),
                                    model_encoder_file=encoder_path)

            df_to_save = eval(
                f"self.df_raw.select(self.numeric_id_name, score_npv_{bp}(*self.used_features).alias('npv_{bp}'))"
            )
        else:
            encoder_path_dc1 = os.path.join(self.models_npv_path, bp, 'dc1',
                                            'encoder_dc1.pkl')
            encoder_path_dc2 = os.path.join(self.models_npv_path, bp, 'dc2',
                                            'encoder_dc2.pkl')
            encoder_path_dc1 = encoder_path_dc1 if os.path.exists(
                encoder_path_dc1) else False
            encoder_path_dc2 = encoder_path_dc2 if os.path.exists(
                encoder_path_dc2) else False
            self.get_score_func_npv(model_file=os.path.join(
                self.models_npv_path, bp, 'dc1', 'dc1.pkl'),
                                    model_encoder_file=encoder_path_dc1)
            used_features_dc1 = self.used_features
            self.get_score_func_npv(model_file=os.path.join(
                self.models_npv_path, bp, 'dc2', 'dc2.pkl'),
                                    model_encoder_file=encoder_path_dc2)
            used_features_dc2 = self.used_features
            df_to_save = eval(
                f'''self.df.withColumn('nmb_group', F.lit(1)).select(
                    "{self.numeric_id_name}",
                    F.when(F.col("prd_crd_dc_active_qty") > 0, score_npv_dc2(*used_features_dc2))\
                    .otherwise(score_npv_dc1(*used_features_dc1))\
                    .alias('npv_{bp}')
                )''')
        df_to_save = df_to_save.repartition(
            int(np.ceil(self.dm_length / self.bucket_size / 3)))
        df_to_save.write.option(
            'compression',
            'none').mode('overwrite').parquet(score_tab_name)
        self.print_func(f'score {bp} recorded in {score_tab_name}')
        sh.hdfs('dfs', '-setrep', '-R', '2', score_tab_name)
        sh.hdfs('dfs', '-chmod', '-R', '777', score_tab_name)
def get_repartition_value(sdf: pyspark.sql.dataframe.DataFrame,
                          target_size: int = 245,
                          compression: str = 'none') -> int:
    length = sdf.count()
    # write a 10k-row sample to estimate the on-disk size of the data
    df_sample = sdf.limit(int(1e4))
    tmp_file_name = 'test_file'
    while check_hdfs_file_ex(tmp_file_name):
        tmp_file_name += '_'
    df_sample.coalesce(1).write.option('compression', compression)\
        .mode('overwrite').parquet(tmp_file_name)
    sample_byte_weight = int(sh.hdfs('dfs', '-du', tmp_file_name)\
        .stdout.decode('utf-8').split('\n')[-2].split(' ')[0])
    sh.hdfs('dfs', '-rm', '-R', '-skipTrash', tmp_file_name)
    # partitions needed so that each output file is roughly `target_size` MiB
    nd_rep_val = int(sample_byte_weight * length / target_size / (1024 * 1024) /
                     1e4)
    return 1 if nd_rep_val < 1 else nd_rep_val
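
A hedged usage sketch: estimate a partition count that targets roughly `target_size` MiB per parquet file and write with it. The SparkSession `spark` and both paths are assumptions:

# Sketch only: `spark` is an active SparkSession, paths are placeholders.
sdf = spark.read.parquet('hdfs://.../some_table')
n_parts = get_repartition_value(sdf, target_size=245, compression='none')
sdf.repartition(n_parts) \
    .write.option('compression', 'none') \
    .mode('overwrite') \
    .parquet('hdfs://.../some_table_rewritten')
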
Example #5
    def _list_hdfs_content(path):
        """
        [Not currently used]
        Utility to get a list of the items within a HDFS path

        :param path: A valid HDFS folder path
        :return: List of the contained items as string routes
        """

        return [line.rsplit(None, 1)[-1] for line in sh.hdfs('dfs', '-ls', path).split('\n') if
                len(line.rsplit(None, 1))][1:]
Example #6
def getFilelist(path_hdfs):
    files = []
    commands = sh.hdfs('dfs', '-ls', path_hdfs).split('\n')
    for c in commands[1:-1]:
        files.append(c.rsplit(None, 1)[-1])

    print("Files found:")
    for f in files:
        print(f)

    return files
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    # unique temporary name per process/host to avoid collisions
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', name)
    # cast date columns to timestamps before writing
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    # write to HDFS, copy the files to the local working directory, then remove the HDFS copy
    sdf.write.mode('overwrite').parquet(name)
    sh.hdfs('dfs', '-get', name, os.getcwd())
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', name)
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
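
A hedged usage sketch for the round-trip helper above; `spark` and the source path are assumptions, and the frame should be small enough to fit in local memory:

# Sketch only: pull a limited Spark DataFrame into pandas via a temporary
# parquet directory on HDFS and in the local working directory.
sdf_small = spark.read.parquet('hdfs://.../some_table').limit(100000)
pdf = get_parquets_from_sdf(sdf_small)
print(pdf.shape)
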
Example #8
import findspark
import sh
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.clustering import KMeans
from numpy import array
from math import sqrt

# Note: you need to change the path to the Spark installation, the HDFS dataset path and the master's URL

findspark.init("/ <path_to_spark> /spark")
conf = SparkConf().setAppName("SO_project").setMaster(
    "spark:// <master_name> :7077")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)
hdfsdir = '/ <path_to_hdfs> /hdfs/dataset'
files = [
    line.rsplit(None, 1)[-1]
    for line in sh.hdfs('dfs', '-ls', hdfsdir).split('\n')
    if len(line.rsplit(None, 1))
][2:]


def apply_preprocessing(rdd):
    '''
    This function applies some transformations in order to have a dataset with this shape: ((plate,gate)) 
     - **parameters**, **types**, **return** and **return types**::
          :param rdd: RDD to transform
          :type rdd: pyspark.rdd.RDD
          :return: return the transformed RDD 
          :rtype: pyspark.rdd.RDD
    '''
    header = rdd.first()
    rdd = rdd.filter(lambda lines: lines != header)
    return rdd
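
A hedged sketch of how the file listing and the helper above would plausibly be combined; only the call pattern is illustrated, since the function body shown here is abbreviated:

# Sketch only: read the first dataset file found in HDFS and strip its header.
raw_rdd = sc.textFile(files[0])
clean_rdd = apply_preprocessing(raw_rdd)
print(clean_rdd.take(5))
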
def save_sdf_to_ps(sdf: pyspark.sql.dataframe.DataFrame or bool = False,
                   table_name: str = 'new_tab',
                   cur_path: str or bool = False,
                   overwrite: bool = True,
                   hive_schema: str = 'default',
                   ps_folder: str = '',
                   parquet_write_mode: str = 'overwrite',
                   parquet_compression: str = 'none',
                   ps_path: str = 'hdfs://clsklsbx/user/team/team_ds_cltv/'):
    """sdf - Spark DataFrame to save
    table_name - new table name in Hive
    overwrite - overwriting Hive table if it exists
    hive_schema - name of Hive db
    ps_folder - directory in "Persistent Storage" to save
    ps_path - hdfs-link to our "Persistent Storage"
    cur_path - if files exist, we only creating external table
    """
    tab_name = f'{hive_schema}.{table_name}'
    existence = check_hive_table_existence(tab_name)
    ps_folder = hive_schema if len(ps_folder) == 0 else ps_folder
    final_path = f'{ps_path}{ps_folder}'
    table_path = f'{final_path}/{table_name}'

    if any([not existence, overwrite]):
        if existence:
            if not cur_path:
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            else:
                sh.hadoop('distcp', cur_path, new_path)
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            drop_hive_table(tab_name, False)
    else:
        print(f'{tab_name} already exists')
        return None

    if cur_path:
        sdf = spark.read.parquet(cur_path)
        table_path = cur_path

    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    if not cur_path:
        if len(ps_folder) > 0:
            hadoop_folders = list(
                filter(lambda x: len(x) > 1,
                       sh.hadoop('fs', '-ls', '-C', ps_path).split('\n')))
            hadoop_folders = [x.split('/')[-1] for x in hadoop_folders]
            if not any([x == ps_folder for x in hadoop_folders]):
                sh.hadoop('fs', '-mkdir', final_path)
                sh.hdfs('dfs', '-chmod', '-R', '777', final_path)

        sdf.write.option('compression', parquet_compression) \
            .mode(parquet_write_mode).parquet(table_path)

    sh.hdfs('dfs', '-setrep', '-R', '2', table_path)

    send_beeline_query(
        query=f"create external table {tab_name} " \
              f"({','.join([f'{x[0]} {x[1]}' for x in sdf.dtypes])}) " \
              f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " \
              f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' " \
              f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' " \
              f"location '{table_path}' ",
        print_output=False
    )

    sh.hdfs('dfs', '-chmod', '-R', '777', table_path)
    print(f'{tab_name} created, files based in {table_path}')
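
A hedged usage sketch for the function above; the SparkSession `spark`, the helpers it relies on (`check_hive_table_existence`, `drop_hive_table`, `send_beeline_query`) and all names below are assumptions or placeholders:

# Sketch only: save a DataFrame to persistent storage and register it in Hive.
sdf = spark.read.parquet('hdfs://clsklsbx/user/team/team_ds_cltv/some_dm')  # placeholder source
save_sdf_to_ps(
    sdf=sdf,
    table_name='cltv_scores_example',  # hypothetical Hive table name
    hive_schema='team_ds_cltv',        # hypothetical Hive database
    ps_folder='scores',                # hypothetical persistent-storage folder
)
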
Example #10
    def __init__(self,
                 spark: pyspark.sql.session.SparkSession,
                 hdfs_path: str,
                 scores_folder: str,
                 score_date: str,
                 models_path: str,
                 models_npv_path: str,
                 numeric_id_name: str,
                 skip_ex_scores: bool,
                 tab_for_predict_enc: str,
                 tab_for_predict_raw: str,
                 periods: str,
                 print_func: Any,
                 ch_file_ex_func: Any,
                 ch_parq_table_func: Any,
                 needed_bps_npv: list,
                 bucket_size: int,
                 periods_file='periods.xlsx',
                 needed_couples=False):
        self.spark = spark
        self.sc = self.spark.sparkContext
        self.hdfs_path = hdfs_path
        self.scores_folder = scores_folder
        self.score_date = score_date
        self.models_path = models_path
        self.models_npv_path = models_npv_path
        self.numeric_id_name = numeric_id_name
        self.skip_ex_scores = skip_ex_scores
        self.tab_for_predict_enc = tab_for_predict_enc
        self.periods = periods
        self.print_func = print_func
        self.problem_models, self.scored_models = [], []
        self.df = self.spark.read.parquet(tab_for_predict_enc)
        self.df_raw = self.spark.read.parquet(tab_for_predict_raw)
        self.dm_length = self.df.count()
        self.periods_file = periods_file
        self.used_features = []
        self.needed_bps_npv = needed_bps_npv
        self.bucket_size = bucket_size
        self.ch_parq_table_func = ch_parq_table_func
        self.ch_file_ex_func = ch_file_ex_func
        self.needed_couples = needed_couples

        self.sc.setLogLevel('FATAL')

        if not self.periods:
            self.periods = pd.read_excel(periods_file)
            self.periods['model'] = self.periods.apply(
                lambda x: f"{x['bp']}_{x['sp']}", axis=1)
            self.periods = self.periods[['model', 'T_min', 'T_max'
                                         ]].set_index('model').T.to_dict()
            self.periods_setted = None
        else:
            self.periods_setted = {'T_min': periods[0], 'T_max': periods[-1]}

        if not needed_couples:
            try:
                config_file = f'{self.hdfs_path}/score_config/cltv_resp_product_pairs_to_score.csv'
                df = self.spark.read.option('delimiter', ';').csv(
                    config_file, header=True).toPandas().fillna(-1).set_index(
                        'BP').astype('int')
                self.needed_couples = []
                for col in df.columns[1:]:
                    self.needed_couples.extend([
                        (bp, col, priority) for bp, priority in df[
                            df[col] > 0][['priority']].reset_index().values
                        if priority > 0
                    ])
                self.needed_couples = pd.DataFrame(self.needed_couples, columns=['bp', 'sp', 'p'])\
                    .sort_values(['p', 'bp', 'sp'])[['bp', 'sp']].values.tolist()
                self.needed_couples = list(map(tuple, self.needed_couples))
            except Exception as E:
                self.print_func(
                    f'problem with reading config {config_file}: {E}')
                sys.exit()
        self.needed_bps_npv.sort()

        self.score_f = '''@F.pandas_udf(T.ArrayType(T.DoubleType()))
def uplifts_{bp}_{sp}(*cols):
    if not os.path.exists('dspl'):
        import zipfile
        with zipfile.ZipFile(f'dspl.zip', 'r') as z:
            z.extractall('dspl')
    with open('{model_file_f}', 'rb') as f:
        model = pickle.load(f)
    df = pd.concat(cols, axis = 1).astype('float32')
    del cols
    gc.collect()
    l = df.shape[0]
    df.columns = {used_features}
    if str(model).startswith('CBClassifier'):
        cut_ct_ft = list(set(model.categorical_features)&set(df.columns))
        if len(cut_ct_ft) > 0:
            df[cut_ct_ft] = df[cut_ct_ft].astype('int64')
    output = pd.DataFrame()
    nans = np.empty(l)
    nans[:] = np.nan
    for t in range(0, 13):
        if ('{bp}'=='{sp}' or ('{bp}'=="kp" and '{sp}'=="pl"))  and t==0:
            output[str(t)] = nans
        elif t in range({start_t}, {end_t}):
            df['T'] = t
            upl = []
            for prchs in range(2):
                df['{bp}'] = prchs
                upl.append(np.round(model.transform(df).astype('float32'),5))
            output[str(t)] = upl[1] - upl[0]
        else:
            output[str(t)] = nans
    return pd.Series(output[[str(x) for x in
        sorted([int(x) for x in output.columns.tolist()])
        ]].values.tolist())'''

        self.score_npv_f = '''@F.pandas_udf(T.DoubleType())
def score_npv_{model_name}(*cols):
    if not os.path.exists('dspl'):
        import zipfile
        with zipfile.ZipFile(f'dspl.zip', 'r') as z:
            z.extractall('dspl')
    with open('{model_file_f}', 'rb') as f:
        model = pickle.load(f)
    df = pd.concat(cols, axis = 1)
    del cols
    gc.collect()
    df.columns = {used_features}
    if '{encoder_file}' != 'False':
        with open('{encoder_file}', 'rb') as f:
            encoder = pickle.load(f)
        df = encoder.transform(df)
        gc.collect()
    return pd.Series(model.transform(df.astype('float32'))).round(2)'''

        if not self.ch_file_ex_func(f'{hdfs_path}{self.scores_folder}'):
            sh.hdfs('dfs', '-mkdir', f'{hdfs_path}{self.scores_folder}')
            sh.hdfs('dfs', '-chmod', '-R', '777',
                    f'{hdfs_path}{self.scores_folder}')

        self.dm_length = self.df.count()
        self.print_func(f'dm_length: {self.dm_length}')
Example #11
    def score_couple(self, bp, sp):

        if self.skip_ex_scores and f'{bp}_{sp}' in self.scored_models:
            return

        self.print_func(f'Starting to score {bp}_{sp}')
        hadoop_path = f'{self.hdfs_path}{self.scores_folder}'
        score_tab_name = f'{hadoop_path}/uplifts_{bp}_{sp}_{self.score_date}'
        aggr_scores_name = f'{hadoop_path}/uplifts_{bp}_{self.score_date}'
        aggr_scores_step_names = [
            f'{hadoop_path}/step_uplifts_{bp}_{self.score_date}_{i}'
            for i in range(6)
        ]

        if self.skip_ex_scores and self.ch_parq_table_func(score_tab_name):
            self.print_func(
                f'score {bp}_{sp} already exists: {score_tab_name} or added in {aggr_scores_name}'
            )
            self.spark.catalog.clearCache()
            return
        else:
            if self.ch_parq_table_func(aggr_scores_name):
                self.scored_models.extend([
                    x for x in self.spark.read.parquet(
                        aggr_scores_name).columns[1:-1]
                    if x not in self.scored_models
                ])
                if f'{bp}_{sp}' in self.scored_models:
                    self.print_func(
                        f'scores {",".join([x for x in self.scored_models if x.startswith(bp+"_")])} '\
                        f'already added in {aggr_scores_name}'
                    )
                    self.spark.catalog.clearCache()
                    return
            rdy_couples = []
            for t in aggr_scores_step_names:
                if self.ch_parq_table_func(t):
                    rdy_couples.extend([
                        x for x in self.spark.read.parquet(t).columns[1:-1]
                        if x not in self.scored_models
                    ])
                    self.scored_models.extend(rdy_couples)
                else:
                    break
                if len(rdy_couples) != 0:
                    if f'{bp}_{sp}' in rdy_couples:
                        self.print_func(
                            f'score {bp}_{sp} already added in {t}')
                        self.spark.catalog.clearCache()
                        return

        if bp != 'dc1':
            model_file = f'{self.models_path}/{bp}_{sp}.pkl'  # SL
            model_file_f = f'{bp}_{sp}.pkl'
            score_f = self.get_score_func(model_file)
            new_col_name = f'{bp}_{sp}'
            df_to_save = eval(f'''self.df.select(
                    "{self.numeric_id_name}",
                    uplifts_{bp}_{sp}(*[F.col(x) for x in self.used_features]).alias("{bp}_{sp}")
                ).repartition(int(np.ceil(self.dm_length/self.bucket_size)))'''
                              )

        else:
            model_file_f = f'{bp}_{sp}.pkl'
            uplifts_dc2p = self.get_score_func(
                f'{self.models_path}/dc2p_{sp}.pkl')
            used_features_dc2p = [F.col(x) for x in self.used_features]
            uplifts_dc1 = self.get_score_func(
                f'{self.models_path}/dc1_{sp}.pkl')
            used_features_dc1 = [F.col(x) for x in self.used_features]

            df_to_save = eval(f'''self.df.select(
                    "{self.numeric_id_name}",
                    F.when(F.col("prd_crd_dc_active_qty") > 0, uplifts_dc2p_{sp}(*used_features_dc2p))\
                    .otherwise(uplifts_dc1_{sp}(*used_features_dc1))\
                    .alias("{bp}_{sp}")
                ).repartition(int(np.ceil(self.dm_length/self.bucket_size)))'''
                              )

        self.print_func(f'scoring {bp}_{sp}')

        df_to_save = df_to_save.repartition(
            int(np.ceil(self.dm_length / self.bucket_size / 2.5)))

        df_to_save.write.option(
            'compression', 'none').mode('overwrite').parquet(score_tab_name)
        sh.hdfs('dfs', '-setrep', '-R', '2', score_tab_name)
        sh.hdfs('dfs', '-chmod', '-R', '777', score_tab_name)
        self.print_func(f'scores {bp}_{sp} recorded to {score_tab_name}')
        gc.collect()
        self.spark.catalog.clearCache()
        self.scored_models.append(f'{bp}_{sp}')
        self.print_func(
            f'{round((self.needed_couples.index((bp, sp)) + 1) / len(self.needed_couples) * 100, 2)}' \
            f'% of response models done'
        )
Example #12
    def build_mdm_slice(self):
        if self.score_on_ready_data_mart:
            if not self.ch_parq_table_func(self.tab_for_predict):
                out = f'ERROR: specified dm {self.tab_for_predict} does not exist'
                self.print_func(out)
                raise Exception(out)
        elif not self.ch_parq_table_func(self.tab_for_predict):

            self.print_func(f'building dm {self.tab_for_predict}')

            if self.dm_filter_cond:
                self.dm_filter_cond += ' and '
            else:
                self.dm_filter_cond = ''

            df_dm = self.spark.read.parquet(f'{self.dm_path}').filter(
                f'{self.dm_periods_column_name} in ({self.get_date_partition(max(self.periods_from_dm))})'
            ).select([self.numeric_id_name, *list(set(self.cols_to_avg) | set(self.f_cols))])

            df_1 = df_dm.filter(
                f'{self.dm_filter_cond} {self.dm_periods_column_name} = \'{self.slice_date}\' '
            )

            self.dm_length = df_1.count()
            self.print_func(f'slice length: {self.dm_length}')

            if len(self.cols_to_avg) > 0:
                arrg_features = [
                    df_dm.filter(
                        f'{self.dm_filter_cond} {self.dm_periods_column_name} in ' \
                        f'({self.get_date_partition(period)})'
                    ).select([self.numeric_id_name, *self.cols_to_avg]) \
                        .groupby(self.numeric_id_name) \
                        .agg(*[
                        F.avg(col).alias(f'{col}_{period}a')
                        for col in self.cols_to_avg
                    ])
                    for period in self.periods_from_dm
                ]
            else:
                arrg_features = []
            self.repart_val = int(self.dm_length / self.batch_size)

            repart_dfs = [
                x.repartition(self.repart_val, F.col(self.numeric_id_name)) for x in
                (df_1, *arrg_features)
            ]

            df = repart_dfs.pop(0)

            for another_df in repart_dfs:
                df = df.join(another_df, on=self.numeric_id_name)

            df.repartition(self.repart_val) \
                .write.mode('overwrite') \
                .option('compression', 'none') \
                .parquet(self.tab_for_predict)

            self.spark.catalog.clearCache()
            sh.hdfs('dfs', '-chmod', '-R', '777', self.tab_for_predict)
            sh.hdfs('dfs', '-setrep', '-R', '2', self.tab_for_predict)

        else:
            self.print_func(f'dm {self.dm_name} already exists')

        if self.hive_database_for_dms:
            self.hive_query_func(
                query=f"create database if not exists {self.hive_database_for_dms}; "
                      f"drop table if exists {self.hive_database_for_dms}.{self.dm_name}; "
                      f"create external table {self.hive_database_for_dms}.{self.dm_name} " \
                      f"({','.join([f'{x[0]} {x[1]}' for x in self.spark.read.parquet(self.tab_for_predict).dtypes])}) " \
                      f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " \
                      f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' " \
                      f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' " \
                      f"location '{self.tab_for_predict}' "
            )
            self.print_func(f'table {self.hive_database_for_dms}.{self.dm_name} created in hive')
Example #13
import findspark
findspark.init("/opt/spark")
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark import sql

conf = SparkConf().setAppName("SO_project").setMaster("spark://damiani-master-slave-0:7077")
sc = SparkContext(conf = conf)
sqlContext = sql.SQLContext(sc)


# In[7]:


import sh
hdfsdir = '/user/ubuntu/hdfs/dataset'
files = [ line.rsplit(None,1)[-1] for line in sh.hdfs('dfs','-ls',hdfsdir).split('\n') if len(line.rsplit(None,1))][2:]   


# In[8]:


rdd = sc.textFile(files[0])


# In[9]:


rdd.take(5)


# In[10]: