def setup(self):
    """Build the SQLAlchemy-backed fixture: schema, engine, table and expression."""
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.df_schema = schema
    self.schema = df_schema_to_odps_schema(schema)
    self.df = None
    self.expr = None

    self.engine = SQLAlchemyEngine()

    import sqlalchemy
    from sqlalchemy import create_engine

    # NOTE(review): 'postgres://' is a deprecated URL scheme in SQLAlchemy >= 1.4
    # ('postgresql://' is canonical) — confirm the pinned SQLAlchemy version.
    self.sql_engine = engine = create_engine('postgres://localhost/pyodps')
    # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
    # self.sql_engine = engine = create_engine('sqlite://')
    self.conn = engine.connect()

    self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
    columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
    test_table = sqlalchemy.Table('pyodps_test_data', metadata, *columns)
    metadata.create_all()
    self.table = test_table

    self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

    class FakeBar(object):
        # Minimal stand-in for a progress bar; swallows all updates.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def _random_values(self):
    """Return a Record of random values; nullable fields are None ~5% of the time."""
    def maybe(gen):
        # Condition is evaluated first, so the RNG consumption order matches
        # a plain conditional expression.
        return gen() if random.random() >= 0.05 else None

    values = [
        maybe(self._gen_random_string),
        self._gen_random_bigint(),
        maybe(self._gen_random_double),
        maybe(self._gen_random_datetime),
    ]
    schema = df_schema_to_odps_schema(self.schema)
    return Record(schema=schema, values=values)
def setup(self):
    """Create a fresh ODPS table (decimal column excluded) and a Seahawks engine."""
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    # Only the first five columns are kept: 'scale'/'decimal' is deliberately
    # sliced off for this backend.
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5],
        datatypes('string', 'int64', 'float64', 'boolean', 'datetime', 'decimal')[:5])
    self.schema = df_schema_to_odps_schema(schema)

    table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    self.engine = SeahawksEngine(self.odps)

    class FakeBar(object):
        # No-op progress bar stub.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def _random_values(self):
    """Build one Record holding a random (string, int64, float64) row."""
    row = [
        self._gen_random_str(),
        self._gen_random_int64(),
        self._gen_random_float64(),
    ]
    odps_schema = df_schema_to_odps_schema(self.schema)
    return Record(schema=odps_schema, values=row)
def testExistingPersist(self):
    """Persisting into a pre-existing table whose columns are reversed must succeed."""
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE)).append_id()

    odps_schema = df_schema_to_odps_schema(df.schema)
    # Recreate the destination with columns in reverse order so persist()
    # has to match columns by name rather than by position.
    reversed_cols = list(reversed(odps_schema.columns))
    odps_schema = Schema.from_lists(
        [col.name for col in reversed_cols],
        [col.type for col in reversed_cols])

    self.odps.delete_table(EXISTING_PERSIST_TABLE, if_exists=True)
    self.odps.create_table(EXISTING_PERSIST_TABLE, odps_schema)
    df.persist(EXISTING_PERSIST_TABLE)
def setup(self):
    """Generate in-memory pandas test data and wrap it in a DataFrame expression."""
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'string', 'int64', 'float64', 'boolean',
                  'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    # 20 random rows with numeric values bounded to [-1000, 1000].
    self.data = self._gen_data(20, value_range=(-1000, 1000))
    self.df = pd.DataFrame(self.data, columns=schema.names)
    self.expr = DataFrame(self.df, schema=schema)
def testLargeColumnsFormatter(self):
    """repr and HTML repr must agree between pandas- and non-pandas-backed frames."""
    # Widen the schema 10x by suffixing each column name with its copy index.
    names = [name + str(i)
             for i in range(10)
             for name in self.schema.names]
    types = self.schema.types * 10
    schema = Schema.from_lists(names, types)

    def gen_row():
        # One wide row is ten concatenated random narrow rows.
        row = []
        for _ in range(10):
            row.extend(self._random_values().values)
        return row

    odps_schema = df_schema_to_odps_schema(schema)
    data = [Record(schema=odps_schema, values=gen_row()) for _ in range(10)]

    pd = ResultFrame(data=data, schema=schema, pandas=True)
    result = ResultFrame(data=data, schema=schema, pandas=False)
    self.assertEqual(to_str(repr(pd)), to_str(repr(result)))
    self.assertEqual(to_str(pd._repr_html_()), to_str(result._repr_html_()))
def testStaticPartition(self):
    """Persisting with a static partition spec must create that partition."""
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE))
    id_df = df.append_id()

    src_schema = df_schema_to_odps_schema(id_df.schema)
    # Same data columns, plus a string partition column 'ds'.
    schema = Schema(
        columns=src_schema.simple_columns,
        partitions=[Partition(name='ds', type=odps_types.string)])

    self.odps.delete_table(STATIC_PART_TABLE, if_exists=True)
    dest_table = self.odps.create_table(STATIC_PART_TABLE, schema, lifecycle=1)

    id_df.persist(STATIC_PART_TABLE, partition='ds=20170314', lifecycle=1)
    self.assertTrue(dest_table.exist_partition('ds=20170314'))
def setup(self):
    """Create two ODPS tables with seed data and an EngineSelecter for join tests."""
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    table_name = tn('pyodps_test_selecter_table_%s'
                    % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    class FakeBar(object):
        # No-op progress bar stub.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    # Second, smaller table used as the join counterpart.
    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [types.string, types.bigint, types.bigint])
    table_name = tn('pyodps_test_selecter_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    self.expr2 = CollectionExpr(
        _source_data=table2, _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [
        ['name1', 4, -1],
        ['name2', 1, -2],
    ]
    self.odps.write_table(table2, 0, data2)

    self.selecter = EngineSelecter()
def setup(self):
    """Create the shared ODPS engine test table and collection expression.

    Drops any stale copy of the fixture table, recreates it from the
    six-column test schema, and instantiates the ODPSEngine plus a no-op
    progress-bar stub used by engine calls.
    """
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    table_name = 'pyodps_test_engine_table'
    self.odps.delete_table(table_name, if_exists=True)
    # Reuse table_name instead of repeating the string literal, so renaming
    # the fixture table requires a single change (the original duplicated
    # the literal in create_table).
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    self.engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op progress bar stub.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Build an empty pandas-backed collection plus pandas and ODPS engines."""
    def datatypes(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    # Empty frame: only the column labels matter for these tests.
    self.df = pd.DataFrame(None, columns=schema.names)
    self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

    self.engine = PandasEngine(self.odps)
    self.odps_engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op progress bar stub.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Prepare an empty pandas collection and both execution engines."""
    names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
    type_names = ['string', 'int64', 'float64', 'boolean', 'decimal', 'datetime']
    schema = Schema.from_lists(
        names, [validate_data_type(t) for t in type_names])
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    # No rows are needed up front; the tests supply data themselves.
    self.df = pd.DataFrame(None, columns=schema.names)
    self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

    self.engine = PandasEngine(self.odps)
    self.odps_engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op progress bar stub.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def mock_action(self, sources, output_desc=1, msg='', action=None):
    """Wire one or more source expressions/models into a MockNode and return
    mock outputs.

    sources: a single CollectionExpr/model or an iterable of them.
    output_desc: an int (that many DATA outputs) or a string of 'd'/other
        chars, where 'd' means a DATA output and anything else a MODEL output.
    msg, action: passed through to MockNode.
    Returns a single output when exactly one is produced, else a list.
    """
    # PmmlModel is optional; when odps.ml is unavailable every output is
    # forced onto the DATA path below.
    try:
        from odps.ml import PmmlModel
    except ImportError:
        PmmlModel = None
    # Normalize a single source into a one-element list.
    if not isinstance(sources, Iterable):
        sources = [sources, ]
    # Classify each source port: DataFrame expressions are DATA, others MODEL.
    input_types = [PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL
                   for o in sources]
    # Wrap DataFrame sources in adapters; pass other objects through as-is.
    source_objs = [adapter_from_df(s) if isinstance(s, CollectionExpr) else s
                   for s in sources]
    uplinks = [adapter for adapter in source_objs if isinstance(adapter, DFAdapter)]
    if isinstance(output_desc, six.integer_types):
        output_types = [PortType.DATA for _ in range(output_desc)]
    else:
        output_types = [PortType.DATA if ch == 'd' else PortType.MODEL
                        for ch in output_desc]
    merge_node = MockNode(msg, action, input_types, output_types)
    odps = None
    # Link every source into the node; ports are named input1, input2, ...
    # NOTE(review): 'odps' ends up as the client of the LAST source — confirm
    # all sources are expected to share one client.
    for idx, o in enumerate(source_objs):
        o._link_node(merge_node, 'input%d' % (1 + idx))
        odps = o._odps
    outputs = []
    for idx, out_type in enumerate(output_types):
        if out_type == PortType.DATA or PmmlModel is None:
            # Derive the mock output schema from the first DataFrame source.
            schema = df_schema_to_odps_schema(
                six.next(s for s in sources if isinstance(s, CollectionExpr)).schema)
            new_df = DataFrame(DFAdapter._build_mock_table('mock_table', schema, self.odps))
            # Constructing the adapter attaches new_df to the node's output
            # port; the instance itself is not kept.
            DFAdapter(odps, merge_node.outputs['output%d' % (1 + idx)], new_df,
                      uplink=uplinks)
            outputs.append(new_df)
        else:
            outputs.append(PmmlModel(odps, port=merge_node.outputs['output%d' % (1 + idx)]))
    if len(output_types) == 1:
        return outputs[0]
    else:
        return outputs
def _random_values(self):
    """Produce a single random Record conforming to self.schema."""
    generators = (
        self._gen_random_str,
        self._gen_random_int64,
        self._gen_random_float64,
    )
    values = [gen() for gen in generators]
    return Record(schema=df_schema_to_odps_schema(self.schema), values=values)