def test_coll_df_operations(self):
    """Verify split / append_id / summary wire the expected ML graph nodes."""
    from odps.ml.nodes import transform_nodes as tnodes

    # split(0.75) must yield two frames with identical field roles,
    # backed by a "Split" node carrying the fraction parameter.
    parts = self.df.split(0.75)
    self.assertEqual(2, len(parts))
    self.assertEqual(_df_roles(parts[0]), _df_roles(parts[1]))
    node = adapter_from_df(parts[0])._bind_node
    self.assertEqual("Split", node.code_name)
    self.assertEqual(0.75, node.parameters["fraction"])

    # append_id() adds an id column with an empty role, keeping the
    # original columns as FEATUREs, via an "AppendID" node.
    with_id = self.df.append_id()
    expected_roles = dict(
        category="FEATURE",
        petal_length="FEATURE",
        petal_width="FEATURE",
        sepal_width="FEATURE",
        sepal_length="FEATURE",
        append_id="",
    )
    self.assertEqual(expected_roles, _df_roles(with_id))
    node = adapter_from_df(with_id)._bind_node
    self.assertEqual("AppendID", node.code_name)
    self.assertEqual("append_id", node.parameters["IDColName"])

    # The summary adapter must be bound to a SummaryNode.
    summary_adapter = self.df._create_summary_adapter()
    self.assertIsInstance(summary_adapter._bind_node, tnodes.SummaryNode)
def assertFieldsEqual(self, ds1, ds2, func=repr):
    """Assert two field collections are equal after normalization.

    Either argument may be a CollectionExpr (unwrapped to its adapter),
    a DFAdapter (reduced to its ``_fields``), or a plain list. Lists of
    MLField instances are mapped through *func* (default ``repr``) before
    comparison; any other list is compared as-is.
    """
    def canon(obj):
        # Unwrap DataFrame expressions and adapters down to a field list.
        if isinstance(obj, CollectionExpr):
            obj = adapter_from_df(obj)
        if isinstance(obj, DFAdapter):
            obj = obj._fields
        if len(obj) == 0:
            return []
        if isinstance(obj[0], MLField):
            return [func(item) for item in obj]
        return obj

    return self.assertEqual(canon(ds1), canon(ds2))
def mock_action(self, sources, output_desc=1, msg='', action=None):
    """Wire *sources* into a MockNode and return mock output object(s).

    sources: a single DataFrame/model, or an iterable of them.
    output_desc: an int meaning "that many DATA outputs", or a string of
        'd'/'m' characters describing DATA/MODEL ports in order.
    msg, action: forwarded verbatim to MockNode.
    Returns the single output when exactly one port is produced,
    otherwise a list of outputs.
    """
    # PmmlModel is optional; when it cannot be imported, every output
    # falls back to the DATA branch below.
    try:
        from odps.ml import PmmlModel
    except ImportError:
        PmmlModel = None
    if not isinstance(sources, Iterable):
        sources = [sources, ]
    # Input port types mirror source kinds: DataFrame -> DATA, else MODEL.
    input_types = [PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL for o in sources]
    source_objs = [adapter_from_df(s) if isinstance(s, CollectionExpr) else s for s in sources]
    uplinks = [adapter for adapter in source_objs if isinstance(adapter, DFAdapter)]
    if isinstance(output_desc, six.integer_types):
        output_types = [PortType.DATA for _ in range(output_desc)]
    else:
        output_types = [PortType.DATA if ch == 'd' else PortType.MODEL for ch in output_desc]
    merge_node = MockNode(msg, action, input_types, output_types)
    odps = None
    for idx, o in enumerate(source_objs):
        # Port names are 1-based: input1, input2, ...
        o._link_node(merge_node, 'input%d' % (1 + idx))
        odps = o._odps  # the last source's ODPS handle is used below
    outputs = []
    for idx, out_type in enumerate(output_types):
        if out_type == PortType.DATA or PmmlModel is None:
            # Copy the first DataFrame source and bind the copy to this
            # output port through a new DFAdapter.
            new_df = six.next(s for s in sources if isinstance(s, CollectionExpr)).copy()
            DFAdapter(odps, merge_node.outputs['output%d' % (1 + idx)], new_df, uplink=uplinks)
            outputs.append(new_df)
        else:
            outputs.append(PmmlModel(odps, port=merge_node.outputs['output%d' % (1 + idx)]))
    if len(output_types) == 1:
        return outputs[0]
    else:
        return outputs
def test_sample(self):
    """Each sampling flavor must yield a DataFrame bound to the right node."""
    def check(sampled, code_name):
        # Common assertions for every sampling variant; returns the
        # adapter so variant-specific parameters can be inspected.
        self.assertIsInstance(sampled, DataFrame)
        bound = adapter_from_df(sampled)
        self.assertEqual(code_name, bound._bind_node.code_name)
        return bound

    # Count-based and fraction-based sampling both map to RandomSample.
    check(self.df.sample(n=20), "RandomSample")
    check(self.df.sample(frac=0.5), "RandomSample")

    # Supplying weights switches to WeightedSample with probCol set.
    weighted = check(
        self.df.sample(frac=0.5, weights=self.df.sepal_length),
        "WeightedSample",
    )
    self.assertEqual("sepal_length", weighted._bind_node.parameters["probCol"])

    # Per-stratum fractions map to StratifiedSample.
    check(
        self.df.sample(frac={"Iris-setosa": 0.5}, strata="category"),
        "StratifiedSample",
    )
def testSimpleJoin(self):
    """A chained join should leave the adapter node with 0 inputs, 1 output."""
    specs = [
        ('pyodps_test_expr_table', ['name', 'id'], [types.string, types.int64]),
        ('pyodps_test_expr_table1', ['id', 'value'], [types.int64, types.string]),
        ('pyodps_test_expr_table2', ['value', 'num'], [types.string, types.float64]),
    ]
    exprs = []
    for tbl_name, col_names, col_types in specs:
        # Build a mock table expression for each (name, columns, types) spec.
        schema = Schema.from_lists(col_names, col_types)
        table = MockTable(name=tbl_name, schema=schema)
        exprs.append(CollectionExpr(_source_data=table, _schema=schema))

    joined = exprs[0].join(exprs[1]).join(exprs[2])
    bound = adapter_from_df(joined)._bind_node
    self.assertEqual(0, len(bound.inputs))
    self.assertEqual(1, len(bound.outputs))
def mock_action(self, sources, output_desc=1, msg='', action=None):
    """Wire *sources* into a MockNode and return mock output object(s).

    NOTE(review): near-duplicate of the other mock_action in this file;
    this variant materializes outputs as fresh mock tables instead of
    copying the source DataFrame — consider sharing the common logic.

    sources: a single DataFrame/model, or an iterable of them.
    output_desc: an int meaning "that many DATA outputs", or a string of
        'd'/'m' characters describing DATA/MODEL ports in order.
    msg, action: forwarded verbatim to MockNode.
    Returns the single output when exactly one port is produced,
    otherwise a list of outputs.
    """
    # PmmlModel is optional; when it cannot be imported, every output
    # falls back to the DATA branch below.
    try:
        from odps.ml import PmmlModel
    except ImportError:
        PmmlModel = None
    if not isinstance(sources, Iterable):
        sources = [sources, ]
    # Input port types mirror source kinds: DataFrame -> DATA, else MODEL.
    input_types = [PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL for o in sources]
    source_objs = [adapter_from_df(s) if isinstance(s, CollectionExpr) else s for s in sources]
    uplinks = [adapter for adapter in source_objs if isinstance(adapter, DFAdapter)]
    if isinstance(output_desc, six.integer_types):
        output_types = [PortType.DATA for _ in range(output_desc)]
    else:
        output_types = [PortType.DATA if ch == 'd' else PortType.MODEL for ch in output_desc]
    merge_node = MockNode(msg, action, input_types, output_types)
    odps = None
    for idx, o in enumerate(source_objs):
        # Port names are 1-based: input1, input2, ...
        o._link_node(merge_node, 'input%d' % (1 + idx))
        odps = o._odps  # the last source's ODPS handle is used below
    outputs = []
    for idx, out_type in enumerate(output_types):
        if out_type == PortType.DATA or PmmlModel is None:
            # Build a fresh mock table sharing the first DataFrame
            # source's schema, and bind it to this output port.
            schema = df_schema_to_odps_schema(six.next(s for s in sources if isinstance(s, CollectionExpr)).schema)
            new_df = DataFrame(DFAdapter._build_mock_table('mock_table', schema, self.odps))
            DFAdapter(odps, merge_node.outputs['output%d' % (1 + idx)], new_df, uplink=uplinks)
            outputs.append(new_df)
        else:
            outputs.append(PmmlModel(odps, port=merge_node.outputs['output%d' % (1 + idx)]))
    if len(output_types) == 1:
        return outputs[0]
    else:
        return outputs
def _add_case(self, case):
    """Append *case* to the node bound to this DataFrame; returns self for chaining."""
    adapter_from_df(self)._bind_node.cases.append(case)
    return self
def action(df):
    # Callback handed to the mocked node; closes over call_seq,
    # self.ml_context and self.odps from the enclosing test.
    # Records 'B' before running the node and 'A' after, so the
    # enclosing test can assert the execution ordering.
    call_seq.append('B')
    adapter = adapter_from_df(df)
    self.ml_context._run(adapter._bind_node, self.odps)
    call_seq.append('A')
def _get_bind_port(obj):
    """Return the bound port of *obj*, unwrapping DataFrame expressions first."""
    holder = adapter_from_df(obj) if isinstance(obj, CollectionExpr) else obj
    return holder._bind_port
def _df_key_value(df):
    """Map each field name to the repr of its KV config ('' when unset)."""
    return {
        field.name: repr(field.kv_config) if field.kv_config else ""
        for field in adapter_from_df(df).fields
    }
def _df_continuity(df):
    """Map each field name to the name of its continuity enum member."""
    return {field.name: field.continuity.name for field in adapter_from_df(df).fields}
def _df_roles(df):
    """Map each field name to a comma-joined string of its role names."""
    return {
        field.name: ",".join(role.name for role in field.role)
        for field in adapter_from_df(df).fields
    }
def test_operations(self):
    """Exercise every field-mutating operation against mocked targets.

    Pattern for each case: build a fresh mocked target from df1 (and df2
    where relevant), execute one operation, then assert that the source
    fields are untouched and the target fields reflect the operation.
    """
    df1 = self.get_table1_df()
    df1_ep = adapter_from_df(df1)
    # Deep-copied snapshots used to assert the sources are never mutated.
    src_fields1 = copy.deepcopy([f for f in adapter_from_df(df1)._fields])
    df2 = self.get_table2_df()
    df2_ep = adapter_from_df(df2)
    src_fields2 = copy.deepcopy([f for f in adapter_from_df(df2)._fields])

    # BatchRoleOperation: add WEIGHT to both named columns.
    target = self.mock_action(df1)
    self.exec_op(BatchRoleOperation(['col11', 'col12'], FieldRole.WEIGHT, True), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, [set([FieldRole.FEATURE, FieldRole.WEIGHT]), ] * 2, lambda f: f.role)

    # ExcludeFieldsOperation: strip all roles from the excluded column.
    target = self.mock_action(df1)
    self.exec_op(ExcludeFieldsOperation(['col12', ]), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, [set([FieldRole.FEATURE, ]), set()], lambda f: f.role)

    # SingletonRoleOperation: per-column role assignment.
    target = self.mock_action(df1)
    self.exec_op(SingletonRoleOperation({'col11': FieldRole.WEIGHT, 'col12': FieldRole.LABEL}), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, [set([FieldRole.FEATURE, FieldRole.WEIGHT]), set([FieldRole.FEATURE, FieldRole.LABEL])], lambda f: f.role)

    # FieldContinuityOperation: True -> CONTINUOUS, False -> DISCRETE.
    target = self.mock_action(df1)
    self.exec_op(FieldContinuityOperation(dict(col11=True, col12=False, col13=True)), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, [FieldContinuity.CONTINUOUS, FieldContinuity.DISCRETE, FieldContinuity.CONTINUOUS], lambda f: f.continuity)
    self.assertEqual(adapter_from_df(target)._fields[-1].name, 'col13')
    # NOTE(review): asserting the field type equals the literal string
    # 'expected' looks odd — verify this isn't a placeholder for a real
    # type name such as 'bigint'.
    self.assertEqual(adapter_from_df(target)._fields[-1].type, 'expected')

    # FieldKVConfigOperation: attach a KVConfig to each column.
    target = self.mock_action(df1)
    kv_config_vals = [KVConfig(':', ','), KVConfig('_', '+'), KVConfig('*', '%')]
    kv_config = dict(zip(['col11', 'col12', 'col13'], kv_config_vals))
    self.exec_op(FieldKVConfigOperation(kv_config), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, kv_config_vals, lambda f: f.kv_config)
    self.assertEqual(adapter_from_df(target)._fields[-1].name, 'col13')
    # NOTE(review): same suspicious 'expected' literal as above — confirm.
    self.assertEqual(adapter_from_df(target)._fields[-1].type, 'expected')

    # StaticFieldChangeOperation: replace fields outright...
    target = self.mock_action(df1)
    self.exec_op(StaticFieldChangeOperation([MLField('col13', 'bigint', FieldRole.FEATURE), MLField('col14', 'bigint', FieldRole.FEATURE)]), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, ['col13', 'col14'], lambda f: f.name)

    # ...or append to the existing ones when is_append=True.
    target = self.mock_action(df1)
    self.exec_op(StaticFieldChangeOperation([MLField('col13', 'bigint', FieldRole.FEATURE), MLField('col14', 'bigint', FieldRole.FEATURE)], is_append=True), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, ['col11', 'col12', 'col13', 'col14'], lambda f: f.name)

    def test_generator(params, fields):
        # Generator callback: checks the arguments it is handed and
        # returns a field-definition string to be parsed by the operation.
        self.assertDictEqual(params, dict(message='TestMsg'))
        self.assertFieldsEqual(df1, fields[0])
        return 'field1:string:label,field2:bigint'

    # ProgrammaticFieldChangeOperation, replace mode.
    target = self.mock_action(df1, msg='TestMsg')
    self.exec_op(ProgrammaticFieldChangeOperation(
        functools.partial(test_generator, adapter_from_df(target)._bind_node.parameters, {0: df1_ep.fields}),
        is_append=False), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, [
        MLField('field1', 'string', FieldRole.LABEL, FieldContinuity.DISCRETE),
        MLField('field2', 'bigint', FieldRole.FEATURE, FieldContinuity.CONTINUOUS),
    ])

    # ProgrammaticFieldChangeOperation, append mode.
    target = self.mock_action(df1, msg='TestMsg')
    self.exec_op(ProgrammaticFieldChangeOperation(
        functools.partial(test_generator, adapter_from_df(target)._bind_node.parameters, {0: df1_ep.fields}),
        is_append=True), [df1, ], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(target, df1_ep.fields + [
        MLField('field1', 'string', FieldRole.LABEL, FieldContinuity.DISCRETE),
        MLField('field2', 'bigint', FieldRole.FEATURE, FieldContinuity.CONTINUOUS),
    ])

    # MergeFieldsOperation: select all of df1's fields plus df2's first,
    # excluding df2's second; without (False) and with (True) renaming.
    sel_cols = {0: [f.name for f in df1_ep.fields], 1: [df2_ep.fields[0].name, ]}
    exc_cols = {0: [], 1: [df2_ep.fields[1].name, ]}
    target = self.mock_action([df1, df2])
    self.exec_op(MergeFieldsOperation(False, sel_cols, exc_cols), [df1, df2], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(df2, src_fields2)
    self.assertFieldsEqual(target, df1_ep.fields + [df2_ep.fields[0], ])

    target = self.mock_action([df1, df2])
    # With renaming enabled, merged columns are prefixed t<table-index>_.
    new_table_names = ['t0_%s' % f.name for f in df1_ep.fields] + ['t1_%s' % df2_ep.fields[0].name, ]
    self.exec_op(MergeFieldsOperation(True, sel_cols, exc_cols), [df1, df2], target)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertFieldsEqual(df2, src_fields2)
    self.assertFieldsEqual(target, new_table_names, lambda f: f.name)

    # SetPartitionOperation: partitions land on the target, not the source.
    target = self.mock_action(df1)
    part_def = PartitionSelection('part1=1,part2=2')
    self.exec_op(SetPartitionOperation(part_def), [df1, ], target)
    self.assertEqual(df1_ep.partitions, None)
    self.assertFieldsEqual(df1, src_fields1)
    self.assertEqual(repr(part_def), repr(adapter_from_df(target).partitions))
    self.assertFieldsEqual(target, src_fields1)
def exec_op(op, dfs, target):
    """Execute *op* on the adapters behind *dfs*, writing into *target*'s adapter."""
    input_adapters = [adapter_from_df(frame) for frame in dfs]
    op.execute(input_adapters, adapter_from_df(target))