示例#1
0
    def setUp(self):
        """Create the fixtures shared by the joiner tests.

        Two small CSV tables are parsed into ``self.df1`` (one ``date``
        column) and ``self.df2`` (separate ``month``/``day``/``year``
        columns).  Each frame is passed through ``DSboxProfiler.profile``
        together with per-column semantic-type hints, and the results are
        stored as ``self.meta1`` / ``self.meta2``.  ``self.args`` bundles the
        frames and metadata as keyword arguments, and ``self.rltk_joiner``
        holds a fresh ``RLTKJoiner``.
        """
        left_rows = [
            'date,city,temp,description,air condition',
            '12-01-2018,New York,55,rain,3',
            '12-02-2018,Los Angeles,75,sunny,2',
            '12-05-2018,Chicago,41,wind,3',
            '12-06-2018,Chicago,42,cloudy,2',
            '12-07-2018,Chicago,43,snow,3',
            '12-08-2018,Los Angeles,72,moggy,2',
        ]
        self.df1 = pd.read_csv(StringIO('\n'.join(left_rows)))

        right_rows = [
            'month,day,year,city,wind',
            '12,01,2018,new york,37.5',
            '12,02,2018,los angeles,22.1',
            '12,05,2018,chicago,58.8',
        ]
        self.df2 = pd.read_csv(StringIO('\n'.join(right_rows)))

        # One entry per column, in column order; only the date-like columns
        # are given a semantic type, the remaining entries stay empty.
        left_hints = {
            'variables': [
                {'semantic_type': ['http://schema.org/Date']},
                {}, {}, {}, {},
            ],
        }
        right_hints = {
            'variables': [
                {'semantic_type': ['http://schema.org/Month']},
                {'semantic_type': ['http://schema.org/Day']},
                {'semantic_type': ['http://schema.org/Year']},
                {}, {},
            ],
        }

        profiler = DSboxProfiler()
        self.meta1 = profiler.profile(inputs=self.df1, metadata=left_hints)
        self.meta2 = profiler.profile(inputs=self.df2, metadata=right_hints)

        # Keyword arguments common to every join call in the tests.
        self.args = dict(
            left_df=self.df1,
            right_df=self.df2,
            left_metadata=self.meta1,
            right_metadata=self.meta2,
        )

        self.rltk_joiner = RLTKJoiner()
    def prepare_joiner(
        joiner: JoinerType = JoinerType.DEFAULT
    ) -> typing.Optional[JoinerBase]:
        """Instantiate the joiner that corresponds to a joiner type.

        The RLTK and exact-match joiner modules are imported only inside the
        branch that needs them, so their dependencies are not loaded unless
        that joiner is actually requested.

        Args:
            joiner: a ``JoinerType`` member, or a raw value that
                ``JoinerType(...)`` can convert into one.

        Returns:
            A new joiner instance, or ``None`` when ``joiner`` is not a valid
            ``JoinerType`` value or no joiner is mapped to it.

        """

        # Convert once; an unknown value makes the enum lookup raise ValueError.
        try:
            joiner_type = JoinerType(joiner)
        except ValueError:
            return None

        if joiner_type == JoinerType.RLTK:
            # Deferred import: only pay for the RLTK dependencies on demand.
            from datamart.joiners.rltk_joiner import RLTKJoiner
            return RLTKJoiner()

        if joiner_type == JoinerType.DEFAULT:
            return DefaultJoiner()

        if joiner_type == JoinerType.EXACT_MATCH:
            from datamart.joiners.exact_match_joiner import ExactMatchJoiner
            return ExactMatchJoiner()

        # A valid member with no joiner associated to it.
        return None
示例#3
0
class TestRLTKJoiner(unittest.TestCase):
    """Checks ``RLTKJoiner.join`` on a pair of small in-memory tables."""

    def setUp(self):
        """Parse the sample CSV data, profile both frames and build the joiner.

        The left frame has a single ``date`` column while the right frame
        splits the date into ``month``, ``day`` and ``year`` columns.  The
        profiled metadata and both frames are collected in ``self.args`` so
        that tests can forward them to ``join`` as keyword arguments.
        """
        left_csv = '\n'.join((
            'date,city,temp,description,air condition',
            '12-01-2018,New York,55,rain,3',
            '12-02-2018,Los Angeles,75,sunny,2',
            '12-05-2018,Chicago,41,wind,3',
            '12-06-2018,Chicago,42,cloudy,2',
            '12-07-2018,Chicago,43,snow,3',
            '12-08-2018,Los Angeles,72,moggy,2',
        ))
        self.df1 = pd.read_csv(StringIO(left_csv))

        right_csv = '\n'.join((
            'month,day,year,city,wind',
            '12,01,2018,new york,37.5',
            '12,02,2018,los angeles,22.1',
            '12,05,2018,chicago,58.8',
        ))
        self.df2 = pd.read_csv(StringIO(right_csv))

        # Semantic-type hints, one entry per column in column order; columns
        # without a hint get an empty dict.
        left_variables = [
            {'semantic_type': ['http://schema.org/Date']},
            {},
            {},
            {},
            {},
        ]
        right_variables = [
            {'semantic_type': ['http://schema.org/Month']},
            {'semantic_type': ['http://schema.org/Day']},
            {'semantic_type': ['http://schema.org/Year']},
            {},
            {},
        ]

        profiler = DSboxProfiler()
        self.meta1 = profiler.profile(
            inputs=self.df1, metadata={'variables': left_variables})
        self.meta2 = profiler.profile(
            inputs=self.df2, metadata={'variables': right_variables})

        self.args = {
            'left_df': self.df1,
            'right_df': self.df2,
            'left_metadata': self.meta1,
            'right_metadata': self.meta2,
        }

        self.rltk_joiner = RLTKJoiner()

    @Utils.test_print
    def test_join_date(self):
        """Join the left ``date`` column against right month/day/year columns.

        The expected CSV keeps every left row; left dates that have no
        identical date in the right table (12-06, 12-07, 12-08) are paired
        with the ``chicago`` row.
        """
        # Presumably positional column indices: left column 0 is ``date``,
        # right columns 0-2 are ``month``, ``day``, ``year`` -- confirm
        # against RLTKJoiner.join.
        join_columns = {
            'left_columns': [[0]],
            'right_columns': [[0, 1, 2]],
        }
        joined = self.rltk_joiner.join(**self.args, **join_columns)

        expected_csv = '''date,city,temp,description,air condition,city,wind
12-01-2018,New York,55,rain,3,new york,37.5
12-02-2018,Los Angeles,75,sunny,2,los angeles,22.1
12-05-2018,Chicago,41,wind,3,chicago,58.8
12-06-2018,Chicago,42,cloudy,2,chicago,58.8
12-07-2018,Chicago,43,snow,3,chicago,58.8
12-08-2018,Los Angeles,72,moggy,2,chicago,58.8
'''

        actual_csv = joined.to_csv(index=False)
        self.assertEqual(actual_csv, expected_csv)