def setUp(self):
    """Build the fixtures shared by the joiner tests.

    Two small DataFrames are parsed from inline CSV text, each is passed
    through ``DSboxProfiler.profile`` together with seed metadata carrying
    one entry per column, and the results are bundled into ``self.args``.
    An ``RLTKJoiner`` instance is created last.
    """
    # Left table: a single date column plus city and weather readings.
    left_rows = (
        'date,city,temp,description,air condition',
        '12-01-2018,New York,55,rain,3',
        '12-02-2018,Los Angeles,75,sunny,2',
        '12-05-2018,Chicago,41,wind,3',
        '12-06-2018,Chicago,42,cloudy,2',
        '12-07-2018,Chicago,43,snow,3',
        '12-08-2018,Los Angeles,72,moggy,2',
    )
    self.df1 = pd.read_csv(StringIO('\n'.join(left_rows)))

    # Right table: the date is split across month/day/year columns.
    right_rows = (
        'month,day,year,city,wind',
        '12,01,2018,new york,37.5',
        '12,02,2018,los angeles,22.1',
        '12,05,2018,chicago,58.8',
    )
    self.df2 = pd.read_csv(StringIO('\n'.join(right_rows)))

    profiler = DSboxProfiler()

    # Seed metadata: one dict per column; each placeholder is a distinct
    # empty dict, exactly as many as there are untyped columns.
    left_seed = {
        'variables': [
            {'semantic_type': ['http://schema.org/Date']},
            {},
            {},
            {},
            {},
        ],
    }
    self.meta1 = profiler.profile(inputs=self.df1, metadata=left_seed)

    right_seed = {
        'variables': [
            {'semantic_type': ['http://schema.org/Month']},
            {'semantic_type': ['http://schema.org/Day']},
            {'semantic_type': ['http://schema.org/Year']},
            {},
            {},
        ],
    }
    self.meta2 = profiler.profile(inputs=self.df2, metadata=right_seed)

    self.args = dict(
        left_df=self.df1,
        right_df=self.df2,
        left_metadata=self.meta1,
        right_metadata=self.meta2,
    )
    self.rltk_joiner = RLTKJoiner()
def calculate_dsbox_features(data: pd.DataFrame, metadata: typing.Union[dict, None]) -> typing.Union[dict, None]:
    """Calculate dsbox features, add to metadata dictionary

    Args:
        data: dataset as a pandas dataframe
        metadata: metadata dict; may be None or empty

    Returns:
        updated metadata dict as returned by ``DSboxProfiler.profile``, or
        ``metadata`` itself, unchanged, when it is None or empty
    """
    # Nothing to enrich: hand the falsy value straight back to the caller.
    if not metadata:
        return metadata

    # Imported only once profiling is actually needed, so the no-op path
    # above does not depend on the profiler module being loadable.
    from datamart.profilers.dsbox_profiler import DSboxProfiler

    return DSboxProfiler().profile(inputs=data, metadata=metadata)
def __init__(self):
    """Create the profiler instances and store them on this object."""
    basic = BasicProfiler()
    self.basic_profiler = basic

    dsbox = DSboxProfiler()
    self.dsbox_profiler = dsbox
def __init__(self):
    """Create the three profiler instances and store them on this object."""
    basic = BasicProfiler()
    self.basic_profiler = basic

    dsbox = DSboxProfiler()
    self.dsbox_profiler = dsbox

    two_ravens = TwoRavensProfiler()
    self.two_ravens_profiler = two_ravens
def test_dsbox_profiler(self):
    """Profile ``self.df`` with empty per-column metadata and compare the result.

    The seed metadata holds one empty dict per column of ``self.df``; the
    value returned by ``DSboxProfiler.profile`` must equal the expected
    structure built below.
    """

    def once_each(names):
        # Token entries in the expected output all carry a count of 1.
        return [{'name': token, 'count': 1} for token in names]

    column_count = self.df.shape[1]
    self.fake_matadata = {"variables": [{} for _ in range(column_count)]}

    profiled = DSboxProfiler().profile(inputs=self.df, metadata=self.fake_matadata)

    first_column = {
        'dsbox_profiled': {
            'ratio_of_numeric_values': 1.0,
            'number_of_outlier_numeric_values': 0,
        },
    }
    second_column = {
        'dsbox_profiled': {
            'ratio_of_numeric_values': 0.25,
            'number_std': 0,
            'number_of_outlier_numeric_values': 0,
            'most_common_tokens': once_each(
                ['2014-02-23', '2018-10-05', '2020-09-23T00:10:00', '2023213']
            ),
            'number_of_tokens_containing_numeric_char': 4,
            'ratio_of_tokens_containing_numeric_char': 1.0,
            'number_of_values_containing_numeric_char': 4,
            'ratio_of_values_containing_numeric_char': 1.0,
        },
    }
    third_column = {
        'dsbox_profiled': {
            'most_common_tokens': once_each(['Jack', 'Ricky', 'Steve', 'Tom']),
        },
    }
    expected = {'variables': [first_column, second_column, third_column]}

    self.assertEqual(profiled, expected)