def process(self, cardo_context: CardoContextBase, cardo_dataframe: CardoDataFrame, gt: CardoDataFrame) -> CardoDataFrame:
    dataframe = cardo_dataframe.dataframe
    ground_truth_dataframe = gt.dataframe
    true_positive_count, all_positives_count = self.__get_intersections(dataframe, ground_truth_dataframe)
    precision_value = self.__get_precision_value(true_positive_count, all_positives_count)
    if self.precision_column:
        cardo_dataframe.dataframe = dataframe.withColumn(self.precision_column, F.lit(precision_value))
    source_name = self.source_name if self.source_name else cardo_dataframe.table_name
    cardo_context.logger.info(
        f"precision calculation for {source_name} -> "
        f"matches: {true_positive_count}, intersection: {all_positives_count}, "
        f"precision: {precision_value}, "
        f"is friendly: {self.friendly_precision}",
        extra={
            "gt_match": true_positive_count,
            "id": f"{self.match_column}_for_{self.intersection_column}_{gt.table_name}",
            "log_type": self.log_type,
            "count": all_positives_count,
            "statistic_value": precision_value,
            "statistic_type": "precision",
            "table_name": source_name,
            "base_table": gt.table_name
        })
    return cardo_dataframe
def test_created_with_pandas_returns_rdd_correctly(self):
    pandas = self.context.spark.createDataFrame([['aa']], 'column1: string').toPandas()
    cardo_dataframe = CardoDataFrame(pandas, '6')
    self.assertIsInstance(cardo_dataframe.rdd, RDD)
    self.assertItemsEqual(pandas.values[0][0], cardo_dataframe.rdd.collect()[0][0])
def process(self, cardo_context, cardo_dataframe=None):
    # type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
    df = cardo_context.spark.sql(self.query)
    cardo_context.logger.info(
        'finished reading query: {query} from Hive MetaStore'.format(query=self.query))
    return CardoDataFrame(df, self.table)
def process(self, cardo_context, cardo_dataframe=None):
    # type: (CardoContext, CardoDataFrame) -> CardoDataFrame
    cardo_context.spark.catalog.refreshByPath(self.path)
    data = CardoDataFrame(cardo_context.spark.read.load(self.path, self.format, self.schema, **self.options))
    cardo_context.logger.info(u'read data from HDFS path {path} successfully'.format(path=self.path))
    return data
def test_created_with_dataframe_returns_pandas_correctly(self):
    dataset = self.context.spark.createDataFrame([['a']], 'column1: string')
    cardo_dataframe = CardoDataFrame(dataset, '6')
    self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
    self.assertItemsEqual(dataset.collect()[0][0], cardo_dataframe.pandas.values[0][0])
def test_created_with_dataframe_returns_dataframe_correctly(self):
    dataset = self.context.spark.createDataFrame([['a']], 'column1: string')
    cardo_dataframe = CardoDataFrame(dataset, '6')
    self.assertIsInstance(cardo_dataframe.dataframe, DataFrame)
    self.assertEqual(dataset.collect(), cardo_dataframe.dataframe.collect())
def process(self, cardo_context, cardo_dataframe=None): # type: (CardoContext) -> CardoDataFrame cardo_context.logger.info("Reading data from: {} index".format(self.resource)) function_keeper = self.override_spark_to_str() df = cardo_context.spark.read.format(ELASTIC_FORMAT).options(**self.options_dict).load() self.return_to_str_to_original_function(function_keeper) cardo_context.logger.info("Read data from: {} index successfully".format(self.resource)) return CardoDataFrame(df)
def test_condition_runtime_test(self):
    dataset = CardoDataFrame(self.context.spark.createDataFrame([['a'], ['b']], 'col1: string'))
    acc_test = StepAccumulatorRuntimeTest(lambda x: x == 'a', 'unittest')
    acc_test.test(self.context, dataset).dataframe.collect()
    self.assertEqual(1, sum(map(lambda record: record.pm_value, self.log_handler.records)))
def test_rdd_accumulation_without_rows_without_special_columns(self):
    dataset = CardoDataFrame(self.context.spark.sparkContext.parallelize(['1']))
    acc_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
    acc_test.test(self.context, dataset).rdd.collect()
    self.assertEqual(1, sum(map(lambda record: record.pm_value, self.log_handler.records)))
def test_empty_table(self):
    # Arrange
    dataset_no_rows = CardoDataFrame(self.context.spark.createDataFrame([], schema="column: string"))
    dataset_no_cols = CardoDataFrame(self.context.spark.createDataFrame([[]]))
    no_rows_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
    no_cols_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')

    # Act
    no_rows_test.test(self.context, dataset_no_rows).dataframe.collect()
    no_cols_test.test(self.context, dataset_no_cols).dataframe.collect()

    # Assert
    self.assertEqual(0, sum(map(lambda record: record.pm_value, self.log_handler.records)))
def process(self, cardo_context, cardo_dataframe=None):
    # type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
    df = cardo_context.spark.table(self.table_name)
    if self.partitions is not None:
        df = df.repartition(self.partitions)
    cardo_context.logger.info(
        'finished reading table: {table_name} from Hive MetaStore'.format(table_name=self.table_name))
    return CardoDataFrame(df, self.table_name)
def process(self, cardo_context, *cardo_dataframes):
    pool = ThreadPool(self.parallel)
    Elasticsearch(self.host).indices.delete(index=self.index, ignore=[400, 404])
    pool.map(
        lambda cardo_dataframe: self.__write_dataframe_to_elastic(cardo_context, cardo_dataframe),
        cardo_dataframes)
    unioned = self.__read_from_elastic(cardo_context)
    return CardoDataFrame(unioned, 'unioned')
def process(self, cardo_context, cardo_dataframe):
    cardo_context.spark.udf.registerJavaFunction(self.config.udf_name, self.config.location)
    df = cardo_dataframe.dataframe
    df = df.withColumn(
        CONFIG_OUTPUT_COLUMN,
        F.expr("{udf_name}{udf_params}".format(
            udf_name=self.udf_name,
            udf_params=self.generate_udf_parameters())))
    df = self.extract_columns_and_drop(df)
    return CardoDataFrame(df)
def process(self, cardo_context, *cardo_dataframes):
    # type: (CardoContextBase, [CardoDataFrame]) -> CardoDataFrame
    assert len(cardo_dataframes) > 0
    result = self._rename_columns(cardo_dataframes[0])
    for cardo_dataframe in cardo_dataframes[1:]:
        df = self._rename_columns(cardo_dataframe)
        result = result.join(df, self.index_col, how='outer')
    hash_to_df_dict[step_to_hash_dict[self]] = result
    if cardo_context.spark.conf.get(SAVE_RESULTS_AT_RUNTIME_CONFIG, 'True') != 'True':
        self._save_all_steps_results(cardo_context)
    return CardoDataFrame(result)
def test_step_without_unique_tests_should_only_count(self):
    class Test(IStep):
        def process(self, cardo_context, cardo_dataframe):
            return cardo_dataframe

    dataset = CardoDataFrame(self.context.spark.createDataFrame([['a']], 'col1: string'))
    result = RuntimeTestableStep(Test()).process(self.context, dataset)
    result.dataframe.collect()
    self.assertEqual(1, self.get_pm_summary('count'))
    self.assertEqual(2, len(self.log_handler.records))
def process(self, cardo_context, cardo_dataframe=None):
    # type: (CardoContext, None) -> CardoDataFrame
    sheet = utils.find_sheet(cardo_context, utils.read_file(cardo_context, self.filename), self.sheet_name)
    table = self.__table_from_sheet(sheet)
    headers = utils.filter_invisible(sheet.row_values(0))
    schema = str([str(utils.clean_invisible(cell)) for cell in headers]).replace("'", "")[1:-1]
    dataframe = self.__create_dataframe(cardo_context, schema, table)
    return CardoDataFrame(dataframe, sheet.name)
def process(self, cardo_context, *dataframes):
    # type: (CardoContextBase, list) -> CardoDataFrame
    df = union_dataframes(dataframes)
    df = df.dataframe
    df = self.get_log_of_grades(df)
    df = self.combine_sources(df)
    df = self.convert_grade_back_to_normal(df)
    if not self.allow_ones:
        df = self.fix_ones(df)
    return CardoDataFrame(df)
def send_df_to_qsm(self, cardo_context, splited_dataframe):
    with JDBCQuerier(cardo_context, self.connection_string, ORACLE_DRIVER) as querier:
        qsm_queue_config = self._get_queue_config(cardo_context)
        stage_table_name = self.prepare_stage_table(
            cardo_context, CardoDataFrame(splited_dataframe), qsm_queue_config, querier)
        if self.send_stage_to_qsm:
            with JDBCQuerier(cardo_context, self.qsm_connection_string, ORACLE_DRIVER) as qsm_querier:
                qsm_querier.execute(
                    self._create_main_query(cardo_context, qsm_queue_config, stage_table_name))
                self._log_added_stage_table_to_qsm(cardo_context, stage_table_name)
def process(self, cardo_context, cardo_dataframe=None):
    # type: (CardoContext, CardoDataFrame) -> CardoDataFrame
    df = cardo_context.spark.read.jdbc(
        self.connection_string,
        self.table_name,
        column=self.parallel_col,
        lowerBound=self.lower_bound,
        upperBound=self.upper_bound,
        numPartitions=self.num_parallel,
        properties=self.properties)
    cardo_context.logger.info(
        u'read data with {reader} from table {table_name} using connection string: {connection_string} properties: {properties}'.format(
            reader=self.__class__.__name__,
            table_name=self.table_name,
            connection_string=self.connection_string,
            properties=self.properties))
    return CardoDataFrame(dataframe=df, table_name=self.table_name)
def test_multiple_inputs_and_specific_test_on_specific_table(self):
    class Test(IStep):
        def process(self, cardo_context, cardo_dataframe, another_dataframe=None):
            cardo_dataframe.dataframe = cardo_dataframe.dataframe.union(another_dataframe.dataframe)
            return cardo_dataframe

        @IStep.pm_input(cardo_dataframe_index=1)
        def is_null(self, value):
            return value is None

    dataset = CardoDataFrame(self.context.spark.createDataFrame([['a']], 'col1: string'))
    another_dataset = CardoDataFrame(self.context.spark.createDataFrame([[None]], 'col1: string'))
    result = RuntimeTestableStep(Test()).process(self.context, dataset, another_dataset)
    result.dataframe.collect()
    self.assertEqual(2, self.get_pm_summary('count'))
    self.assertEqual(1, self.get_pm_summary('is_null'))
    self.assertEqual(5, len(self.log_handler.records))
def test_unpersist_rdd(self):
    # Arrange
    rdd = self.context.spark.sparkContext.parallelize([Row(column1='aa')])
    second_rdd = self.context.spark.sparkContext.parallelize([Row(column1='bb')])
    cardo_dataframe = CardoDataFrame(rdd, '')
    cardo_dataframe.persist()
    cardo_dataframe.rdd = second_rdd

    # Act
    cardo_dataframe.unpersist()

    # Assert
    self.assertFalse(rdd.is_cached)
def test_unpersist_df(self):
    # Arrange
    df = self.context.spark.createDataFrame([['a']], 'column1: string')
    second_df = self.context.spark.createDataFrame([['b']], 'column1: string')
    cardo_dataframe = CardoDataFrame(df, '')
    cardo_dataframe.persist()
    cardo_dataframe.dataframe = second_df

    # Act
    cardo_dataframe.unpersist()

    # Assert
    self.assertFalse(df.is_cached)
def test_step_with_unique_after_test_also_counts(self):
    class Test(IStep):
        def process(self, cardo_context, cardo_dataframe):
            return cardo_dataframe

        @IStep.pm_output()
        def is_null(self, value):
            return value is None

    dataset = CardoDataFrame(self.context.spark.createDataFrame([['a']], 'col1: string'))
    result = RuntimeTestableStep(Test()).process(self.context, dataset)
    result.dataframe.collect()
    self.assertEqual(1, self.get_pm_summary('count'))
    self.assertEqual(0, self.get_pm_summary('is_null'))
    self.assertEqual(3, len(self.log_handler.records))
def process(self, cardo_context, cardo_dataframe=None):
    hash_column = ROWID
    if self.check_view and self._check_table_or_view(cardo_context):
        self.use_partitions = False
        hash_column = self._get_view_hash_column(cardo_context)
    query_frame = GET_ROWS_FOR_PARTITION_QUERY.format(
        table_owner=self.table_owner,
        table_name=self.table_name,
        where_clause=self.where_clause,
        select_clause=self.select_clause,
        table_divider_clause=TABLE_DIVIDER_CLAUSE.format(row_index=hash_column))
    partitions_name_num_rows_list = None
    if self.use_partitions:
        subpartitions_list = self._get_table_partitions(cardo_context, get_subpartitions=True)
        partitions_list = self._get_table_partitions(cardo_context, get_subpartitions=False)
        partitions_name_num_rows_list = subpartitions_list or partitions_list
        if subpartitions_list:
            query_frame = query_frame.format(use_partition=USE_SUBPARTITION)
        else:
            query_frame = query_frame.format(use_partition=USE_PARTITION if partitions_list else DONT_USE_PARTITION)
    else:
        query_frame = query_frame.format(use_partition=DONT_USE_PARTITION)
    if partitions_name_num_rows_list:
        readers = self._create_readers_by_partitions(partitions_name_num_rows_list, query_frame)
    else:
        readers = [OracleReader(query_frame.format(num_parallel=self.num_parallel, index=reader_index),
                                self.connection_string, fetchsize=self.fetchsize)
                   for reader_index in range(self.num_parallel)]
    pool = Pool(min(len(readers) // 10, 100) + 1)
    prev_log_level = cardo_context.logger.level
    cardo_context.logger.setLevel('ERROR')
    dataframes = pool.map(lambda reader: reader.process(cardo_context).dataframe, readers)
    cardo_context.logger.setLevel(prev_log_level)
    df_united = OracleParallelReader.merge_reduce(lambda df1, df2: df1.union(df2), dataframes)
    cardo_context.logger.info(
        u'read data from Oracle table {table_owner}.{table_name} using OracleParallelReader successfully'.format(
            table_owner=self.table_owner, table_name=self.table_name))
    return CardoDataFrame(df_united, table_name=self.table_name)
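OracleParallelReader.merge_reduce is called above but its implementation is not shown here. The sketch below is only an assumption about how such a helper might work (the name is taken from the call site; the body is guessed, not the library's code): it reduces the per-reader dataframes pairwise, which keeps the chained union plan roughly log2(n) levels deep instead of the n levels a left-to-right functools.reduce would build.

def merge_reduce(merge_function, items):
    # Hypothetical sketch, not the library's implementation: merge neighbouring items
    # two at a time until a single result remains.
    items = list(items)
    if not items:
        raise ValueError('merge_reduce needs at least one item')
    while len(items) > 1:
        merged = [merge_function(first, second) for first, second in zip(items[::2], items[1::2])]
        if len(items) % 2 == 1:
            merged.append(items[-1])  # carry the unpaired last item into the next round
        items = merged
    return items[0]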
def union_dataframes(*dataframes: Union[CardoDataFrame, List[CardoDataFrame], Tuple[CardoDataFrame, ...]]):
    """
    :param dataframes: CardoDataFrames to union, passed either as varargs or as lists/tuples of CardoDataFrames
    :return: union of all those dataframes
    """
    if isinstance(dataframes[0], (list, tuple)):
        # flatten nested lists/tuples and recurse with plain CardoDataFrames
        return union_dataframes(*[
            dataframe for many_dataframes in dataframes for dataframe in many_dataframes
        ])
    if dataframes[0].payload_type in ['dataframe', 'rdd']:
        return functools.reduce(
            lambda df1, df2: CardoDataFrame(
                df1.dataframe.union(df2.dataframe.select(df1.dataframe.columns))),
            dataframes)
    if dataframes[0].payload_type == 'pandas':
        # concatenate the pandas payloads; pd.concat expects pandas objects, not CardoDataFrame wrappers
        return pd.concat([dataframe.pandas for dataframe in dataframes], axis=0)
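A minimal usage sketch for union_dataframes, assuming only what the surrounding snippets show (a SparkSession and the CardoDataFrame(dataframe, table_name) wrapper); the example function name and the local SparkSession setup are illustrative, not part of the library.

def union_dataframes_usage_example():
    # Illustrative only: two small CardoDataFrames with the same columns in a different
    # order are unioned; union_dataframes re-selects the second dataframe's columns in
    # the first one's order, so matching column names are enough.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    first = CardoDataFrame(spark.createDataFrame([['a', 1]], 'name: string, grade: int'), 'first')
    second = CardoDataFrame(spark.createDataFrame([[2, 'b']], 'grade: int, name: string'), 'second')
    unioned = union_dataframes(first, second)  # a list also works: union_dataframes([first, second])
    return unioned.dataframe.collect()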
def test_step_with_unique_test_on_specific_column(self):
    class Test(IStep):
        def process(self, cardo_context, cardo_dataframe):
            cardo_dataframe.dataframe = cardo_dataframe.dataframe.where('col1 is not null')
            return cardo_dataframe

        @IStep.pm_input(['col1'])
        def is_null_before(self, value):
            return value is None

        @IStep.pm_output(['col1'])
        def is_null_after(self, value):
            return value is None

    dataset = CardoDataFrame(
        self.context.spark.createDataFrame([[1, 'a'], [2, None]], 'num: int, col1: string'))
    result = RuntimeTestableStep(Test()).process(self.context, dataset)
    result.dataframe.collect()
    self.assertEqual(1, self.get_pm_summary('count'))
    self.assertEqual(1, self.get_pm_summary('is_null_before'))
    self.assertEqual(0, self.get_pm_summary('is_null_after'))
    self.assertEqual(5, len(self.log_handler.records))
def test_created_with_rdd_returns_pandas_correctly(self):
    rdd = self.context.spark.sparkContext.parallelize([Row(column1='a')])
    cardo_dataframe = CardoDataFrame(rdd, '6')
    self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
    self.assertItemsEqual(rdd.collect()[0][0], cardo_dataframe.pandas.values[0][0])
def test_created_with_pandas_returns_pandas_correctly(self):
    pandas_df = self.context.spark.createDataFrame([['a']], 'column1: string').toPandas()
    cardo_dataframe = CardoDataFrame(pandas_df)
    self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
    self.assertTrue(pandas_df.equals(cardo_dataframe.pandas))
def test_created_with_rdd_returns_rdd_correctly(self):
    rdd = self.context.spark.sparkContext.parallelize([Row(column1='a')])
    cardo_dataframe = CardoDataFrame(rdd, '6')
    self.assertIsInstance(cardo_dataframe.rdd, RDD)
    self.assertItemsEqual(rdd.collect(), cardo_dataframe.rdd.collect())
def process(self, cardo_context: CardoContextBase, cardo_dataframe: CardoDataFrame = None) -> CardoDataFrame:
    if self.reader:
        logic_dataframe = self.reader.process(cardo_context, cardo_dataframe)
        return CardoDataFrame(logic_dataframe, snake_case(self.__class__.__name__))
    else:
        raise NotImplementedError