def setUp(self):
    """Prepare the sample tables, their profiled metadata and the joiner.

    The left table carries a single ``date`` column while the right table
    spreads the date over ``month``/``day``/``year`` columns. Both frames are
    run through ``DSboxProfiler.profile`` together with semantic-type hints,
    and the frames plus the profiling results are bundled in ``self.args`` as
    keyword arguments for a join call.
    """
    left_rows = (
        'date,city,temp,description,air condition',
        '12-01-2018,New York,55,rain,3',
        '12-02-2018,Los Angeles,75,sunny,2',
        '12-05-2018,Chicago,41,wind,3',
        '12-06-2018,Chicago,42,cloudy,2',
        '12-07-2018,Chicago,43,snow,3',
        '12-08-2018,Los Angeles,72,moggy,2',
    )
    left_csv = '\n'.join(left_rows)
    self.df1 = pd.read_csv(StringIO(left_csv))

    right_rows = (
        'month,day,year,city,wind',
        '12,01,2018,new york,37.5',
        '12,02,2018,los angeles,22.1',
        '12,05,2018,chicago,58.8',
    )
    right_csv = '\n'.join(right_rows)
    self.df2 = pd.read_csv(StringIO(right_csv))

    profiler = DSboxProfiler()

    # One hint entry per column; columns without a hint get a fresh empty
    # dict each (never a shared one).
    left_hints = {
        'variables': [
            {'semantic_type': ['http://schema.org/Date']},
            {},
            {},
            {},
            {},
        ]
    }
    self.meta1 = profiler.profile(inputs=self.df1, metadata=left_hints)

    right_hints = {
        'variables': [
            {'semantic_type': ['http://schema.org/Month']},
            {'semantic_type': ['http://schema.org/Day']},
            {'semantic_type': ['http://schema.org/Year']},
            {},
            {},
        ]
    }
    self.meta2 = profiler.profile(inputs=self.df2, metadata=right_hints)

    self.args = dict(
        left_df=self.df1,
        right_df=self.df2,
        left_metadata=self.meta1,
        right_metadata=self.meta2,
    )
    self.rltk_joiner = RLTKJoiner()
def prepare_joiner(
    joiner: JoinerType = JoinerType.DEFAULT
) -> typing.Optional[JoinerBase]:
    """Instantiate the joiner that corresponds to ``joiner``.

    The RLTK and exact-match joiner modules are imported only inside the
    branch that needs them, so a backend such as RLTK (which may need many
    dependency packages) is not loaded unless it is actually requested.

    Args:
        joiner: A ``JoinerType`` member, or a raw value that
            ``JoinerType(...)`` can resolve to a member.

    Returns:
        A new joiner instance for the resolved type, or ``None`` when
        ``joiner`` cannot be resolved to a ``JoinerType`` member or the
        resolved member has no joiner handled here.
    """
    try:
        resolved = JoinerType(joiner)
    except ValueError:
        # The value does not name any known joiner type.
        return None

    if resolved == JoinerType.RLTK:
        from datamart.joiners.rltk_joiner import RLTKJoiner
        return RLTKJoiner()
    elif resolved == JoinerType.DEFAULT:
        return DefaultJoiner()
    elif resolved == JoinerType.EXACT_MATCH:
        from datamart.joiners.exact_match_joiner import ExactMatchJoiner
        return ExactMatchJoiner()

    return None
class TestRLTKJoiner(unittest.TestCase):
    """Checks ``RLTKJoiner.join`` against a small hand-written fixture.

    The left table has a single ``date`` column; the right table stores the
    same information split over ``month``, ``day`` and ``year`` columns.
    """

    @staticmethod
    def _frame_from_lines(lines):
        """Parse CSV ``lines`` (header line first) into a DataFrame."""
        return pd.read_csv(StringIO('\n'.join(lines)))

    def setUp(self):
        """Create both frames, profile them and build the join arguments."""
        self.df1 = self._frame_from_lines([
            'date,city,temp,description,air condition',
            '12-01-2018,New York,55,rain,3',
            '12-02-2018,Los Angeles,75,sunny,2',
            '12-05-2018,Chicago,41,wind,3',
            '12-06-2018,Chicago,42,cloudy,2',
            '12-07-2018,Chicago,43,snow,3',
            '12-08-2018,Los Angeles,72,moggy,2',
        ])
        self.df2 = self._frame_from_lines([
            'month,day,year,city,wind',
            '12,01,2018,new york,37.5',
            '12,02,2018,los angeles,22.1',
            '12,05,2018,chicago,58.8',
        ])

        profiler = DSboxProfiler()

        # Semantic-type hints, one entry per column; an empty dict means no
        # hint is supplied for that column.
        date_hint = {'semantic_type': ['http://schema.org/Date']}
        self.meta1 = profiler.profile(
            inputs=self.df1,
            metadata={'variables': [date_hint, {}, {}, {}, {}]},
        )

        month_hint = {'semantic_type': ['http://schema.org/Month']}
        day_hint = {'semantic_type': ['http://schema.org/Day']}
        year_hint = {'semantic_type': ['http://schema.org/Year']}
        self.meta2 = profiler.profile(
            inputs=self.df2,
            metadata={'variables': [month_hint, day_hint, year_hint, {}, {}]},
        )

        self.args = dict(
            left_df=self.df1,
            right_df=self.df2,
            left_metadata=self.meta1,
            right_metadata=self.meta2,
        )
        self.rltk_joiner = RLTKJoiner()

    @Utils.test_print
    def test_join_date(self):
        # Join the left ``date`` column (index 0) with the right
        # month/day/year columns (indices 0, 1, 2) and compare the CSV
        # rendering of the result with the expected text.
        joined = self.rltk_joiner.join(
            **self.args,
            left_columns=[[0]],
            right_columns=[[0, 1, 2]],
        )
        expected = '''date,city,temp,description,air condition,city,wind
12-01-2018,New York,55,rain,3,new york,37.5
12-02-2018,Los Angeles,75,sunny,2,los angeles,22.1
12-05-2018,Chicago,41,wind,3,chicago,58.8
12-06-2018,Chicago,42,cloudy,2,chicago,58.8
12-07-2018,Chicago,43,snow,3,chicago,58.8
12-08-2018,Los Angeles,72,moggy,2,chicago,58.8
'''
        actual = joined.to_csv(index=False)
        self.assertEqual(actual, expected)