Пример #1
0
    def test_generator(self):
        from ambry_sources.sources import GeneratorSource, SourceSpec
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())

        def gen():

            yield list('abcde')

            for i in range(10):
                yield [i, i + 1, i + 2, i + 3, i + 4]

        f = HDFPartition(cache_fs, 'foobar')

        s = GeneratorSource(SourceSpec('foobar'), gen())

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(GeneratorSource(SourceSpec('foobar'), gen()))
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

        self.assertEqual(f.headers, list('abcde'))
        rows = []

        for row in f.select():
            rows.append(row.dict)
        self.assertEqual(len(rows), 10)
        self.assertEqual(rows[0], {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
        self.assertEqual(rows[-1], {'a': 9, 'b': 10, 'c': 11, 'd': 12, 'e': 13})
Пример #2
0
    def test_fixed(self):
        from ambry_sources import head, tail
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = self.sources['simple_fixed']
        assert spec.has_rowspec is False
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # prepare HDFPartition.
        f = HDFPartition(cache_fs, spec.name)

        ri = RowIntuiter().run(head(s, 100), tail(s, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(s)
        with f.writer as w:
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)
        f.load_rows(s)
        self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
Пример #3
0
    def test_stats(self):
        """Check that the sources can be loaded and analyzed without exceptions and that the
        guesses for headers and start are as expected"""
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

        f = HDFPartition(cache_fs, source.spec.name)

        with f.writer as w:
            ri = RowIntuiter().run(head(source, 100), tail(source, 100))
            row_spec = self._row_intuiter_to_dict(ri)
            ti = TypeIntuiter().process_header(ri.headers).run(source)
            w.set_row_spec(row_spec, ri.headers)
            w.set_types(ti)

        f.load_rows(source, run_stats=True)

        expected = {
            u('str_a'):   (30, None, None, None, 10),
            u('str_b'):   (30, None, None, None, 10),
            u('float_a'): (30, 1.0, 5.5, 10.0, 10),
            u('float_b'): (30, 1.1, 5.5, 9.9, 10),
            u('float_c'): (30, 1.1, 5.5, 9.9, 10),
            u('int_b'):   (30, 1.0, 5.0, 9.0, 10),
            u('int_a'):   (30, 1.0, 5.5, 10.0, 10)}

        with f.reader as r:

            for col in r.columns:
                stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None,
                         col.max,
                         col.nuniques)
                for a, b in zip(expected[col.name], stats):
                    self.assertEqual(
                        a, b,
                        'Saved stat ({}) does not match to expected ({}) for {}'.format(a, b, col.name))
Пример #4
0
    def test_load_and_headers(self):
        """ Just checks that all of the sources can be loaded without exceptions. """
        from ambry_sources import head, tail

        cache_fs = fsopendir('temp://')

        source_headers = {
            'mz_with_zip_xl': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
            'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'simple': [u('id'), u('uuid'), u('int'), u('float')],
            'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
            'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
            'rpeople': [u('name'), u('size')],
            'rent07': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
            'altname': [u('id'), u('foo'), u('bar'), u('baz')],
            'rentcsv': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'renttab': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'multiexcel': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
            'rent97': [
                u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
        }

        for source_name, spec in self.sources.items():
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

            f = HDFPartition(cache_fs, spec.name)
            if f.exists:
                f.remove()

            # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
            with f.writer as w:
                if spec.has_rowspec:
                    row_spec = self._spec_to_dict(spec)
                    headers = self._get_headers(s, spec)
                    ti = TypeIntuiter().process_header(headers).run(s)
                    w.set_row_spec(row_spec, headers)
                    w.set_types(ti)
                else:
                    ri = RowIntuiter().run(head(s, 20), tail(s, 20), w.n_rows)
                    row_spec = self._row_intuiter_to_dict(ri)
                    ti = TypeIntuiter().process_header(ri.headers).run(s)
                    w.set_row_spec(row_spec, ri.headers)
                    w.set_types(ti)
            f.load_rows(s)

            with f.reader as r:
                if spec.name in source_headers:
                    self.assertEqual(source_headers[spec.name], r.headers)