def testTokenizedSignServerAccount(self):
    """Exercise a token-protected SignServer with and without the token.

    A client built without the token must fail with ``SignServerError``;
    a client built with ``server.token`` must be able to delete, create
    and drop a table through the sign server.
    """
    server = SignServer(token=str(uuid.uuid4()))
    server.accounts[self.odps.account.access_id] = self.odps.account.secret_access_key
    try:
        # Port 0 lets the OS pick a free port; the actual address is read
        # back from ``server.server.server_address``.
        server.start(('127.0.0.1', 0))

        # Without the token every signed request is expected to be rejected.
        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        self.assertRaises(
            SignServerError,
            lambda: odps.delete_table(tn('test_sign_account_table'), if_exists=True))

        # With the matching token the same operations must succeed.
        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address,
                                    token=server.token)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        odps.delete_table(tn('test_sign_account_table'), if_exists=True)
        t = odps.create_table(tn('test_sign_account_table'), 'col string', lifecycle=1)
        self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
        # ``async`` is a reserved word since Python 3.7, so the keyword
        # argument is spelled ``async_`` (as in testSignServerAccount).
        t.drop(async_=True)
    finally:
        server.stop()
def testSignServerAccount(self):
    """Run a delete/create/exists/drop round trip through a SignServer.

    The server is started without a token and is always stopped at the
    end, whether or not the table operations succeed.
    """
    sign_server = SignServer()
    base_account = self.odps.account
    sign_server.accounts[base_account.access_id] = base_account.secret_access_key
    try:
        # Bind to an OS-assigned port on localhost.
        sign_server.start(('127.0.0.1', 0))

        signed_account = SignServerAccount(
            self.odps.account.access_id,
            sign_server.server.server_address,
        )
        client = ODPS(
            None,
            None,
            self.odps.project,
            self.odps.endpoint,
            account=signed_account,
        )

        client.delete_table(tn('test_sign_account_table'), if_exists=True)
        table = client.create_table(
            tn('test_sign_account_table'), 'col string', lifecycle=1)
        self.assertTrue(client.exist_table(tn('test_sign_account_table')))
        table.drop(async_=True)
    finally:
        sign_server.stop()
class ODPSWriter(object):
    """Writes batches of records into a ``worker``-partitioned ODPS table."""

    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Constructs a `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name. A ``project.table`` value overrides
                the ``project`` argument.
            columns: The list of column names in the table; only required
                when the table does not exist yet.
            column_types: The list of column types in the table; only
                required when the table does not exist yet.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        # A dot after the first character marks a "project.table" name.
        if table.find(".") > 0:
            project, table = table.split(".")
        options = {} if options is None else options

        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        # Resolved lazily on the first call to from_iterator().
        self._odps_table = None

        _configure_odps_options(self._endpoint, options)
        self._odps_client = ODPS(
            self._access_id, self._access_key, self._project, self._endpoint
        )

    def _initialize_table(self):
        """Bind ``self._odps_table`` to the target table, creating it if absent.

        Raises:
            ValueError: The table does not exist and ``columns`` or
                ``column_types`` was not supplied.
        """
        client = self._odps_client
        if client.exist_table(self._table, self._project):
            self._odps_table = client.get_table(self._table, self._project)
            return

        if self._columns is None or self._column_types is None:
            raise ValueError("columns and column_types need to be "
                             "specified for non-existing table.")

        # New tables get a single string partition column named "worker".
        table_schema = Schema.from_lists(
            self._columns, self._column_types, ["worker"], ["string"]
        )
        self._odps_table = client.create_table(self._table, table_schema)

    def from_iterator(self, records_iter, worker_index):
        """Write every batch from ``records_iter`` into one worker partition.

        Args:
            records_iter: Iterable whose items are passed one by one to
                the table writer's ``write``.
            worker_index: Value used to build the ``worker=<index>``
                partition spec; the partition is created when missing.
        """
        if self._odps_table is None:
            self._initialize_table()

        partition_spec = "worker=" + str(worker_index)
        with self._odps_table.open_writer(
            partition=partition_spec, create_partition=True
        ) as writer:
            for batch in records_iter:
                writer.write(batch)
''' s = ODPS('', '', '%s' % sys.argv[1], endpoint='http://service.cn.maxcompute.aliyun.com/api') d = ODPS('', '', '%s' % sys.argv[2], endpoint='http://service.cn.maxcompute.aliyun.com/api') print("######################################################################") for table in s.list_tables(): t1 = s.get_table(table.name) if d.exist_table(table.name): t2 = d.get_table(table.name) else: print("表%s 在目标项目%s中不存在 跳过校验" % (table.name, sys.argv[2])) continue if table.schema.partitions: #判断该表是否为分区表 #print 'Table %s is partitioned.' %table.name for partition in table.partitions: #print partition.name with t1.open_reader(partition='%s' % partition.name) as reader: count1 = reader.count #print "表名:%s\t分区:%s\t数据量:%s" %(table.name,partition.name,count1) if t2.exist_partition(partition.name): with t2.open_reader(partition='%s' % partition.name) as reader2:
class ODPSSql(Magics):
    """IPython magics for entering ODPS rooms, running SQL and persisting frames."""

    # ODPS entry object used by the magics; created lazily by _set_odps()
    # or replaced by the ``%enter`` magic.
    _odps = None

    def _set_odps(self):
        """Ensure ``self._odps`` is set.

        Uses the global ``options`` when access id, access key and default
        project are all configured; otherwise falls back to the default room.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and \
                options.access_key is not None and \
                options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        """Enter the named room (or the default one) and expose its ODPS object.

        The ODPS object is also published as ``o`` in the user namespace
        unless that name is already taken. Returns the room object.
        """
        room = line.strip()
        # Inside a method, ``enter`` resolves to the module-level function.
        r = enter(room) if room else enter()
        self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        """Set up a room: the first token is the room name, the rest are arguments."""
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)
        html_notify('setup succeeded')

    @line_magic('teardown')
    def teardown(self, line):
        """Tear down the room named on the magic line."""
        name = line.strip()
        teardown(name)
        html_notify('teardown succeeded')

    @line_magic('list_rooms')
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms()``."""
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        """Display the stores of the named room, or of the default room."""
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the mean completion ratio over the stages of one task.

        Stages without workers contribute 0 to the sum but still count in
        the denominator. Returns 0 when the task reports no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(self, msg):
        """Default verbose logger: print ``msg``."""
        print(msg)

    @staticmethod
    def _reader_to_result(reader):
        """Convert an instance result reader into the value returned by ``%sql``.

        With pandas available the raw text is parsed by ``pandas.read_csv``,
        falling back to the raw text when parsing fails. Without pandas a
        ``ResultFrame`` is built, falling back to the raw text on TypeError.
        """
        try:
            import pandas as pd
        except ImportError:
            pd = None

        if pd is None:
            try:
                return ResultFrame(list(reader), columns=reader._columns)
            except TypeError:
                return reader.raw

        # The parser error class moved between pandas releases. Resolving it
        # separately keeps a missing ``pandas.parser`` module from being
        # mistaken for "pandas is not installed".
        try:
            from pandas.errors import ParserError as CParserError
        except ImportError:
            try:
                from pandas.parser import CParserError
            except ImportError:
                CParserError = ValueError

        try:
            return pd.read_csv(StringIO(reader.raw))
        except (ValueError, CParserError):
            return reader.raw

    @line_magic('set')
    def set_hint(self, line):
        """Store a ``key=value`` SQL setting in ``options.sql.settings``.

        Raises:
            ValueError: The line contains no ``=``.
        """
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        """Run the SQL given on the line and/or cell and return its result.

        Statements starting with ``set `` are collected as hints for this
        run; the remaining statements are re-joined with ``;`` and
        submitted. Returns ``None`` when there is nothing to run.
        """
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        for s in content.split(';'):
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        # Only ``set`` statements (or nothing at all): nothing to submit, and
        # the parameter substitution must not be handed ``None``.
        if not sql:
            return None

        # replace user defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)
        if not sql:
            return None

        bar = init_progress_bar()
        # The bar is closed on every exit path, including a failing instance.
        try:
            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout(' Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or past 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            with instance.open_reader() as reader:
                res = self._reader_to_result(reader)
            html_notify('SQL execution succeeded')
            return res
        finally:
            bar.close()

    @line_magic('persist')
    def persist(self, line):
        """Persist a frame from the user namespace: ``%persist <frame> [project.]<table>``.

        Raises:
            TypeError: The target table already exists, or the named object
                is neither an ODPS ``DataFrame`` nor a pandas ``DataFrame``.
        """
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name, notify=False)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name, notify=False)
        else:
            # Do not report success when nothing was written.
            raise TypeError('%s is not a DataFrame' % frame_name)
        html_notify('Persist succeeded')
class ODPSSql(Magics):
    """IPython magics for entering ODPS rooms, running SQL and persisting frames."""

    # ODPS entry object used by the magics; created lazily by _set_odps()
    # or replaced by the ``%enter`` magic.
    _odps = None

    def _set_odps(self):
        """Ensure ``self._odps`` is set.

        Uses the global ``options`` when access id, access key and default
        project are all configured; otherwise falls back to the default room.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and options.access_key is not None and options.default_project is not None:
            self._odps = ODPS(
                options.access_id,
                options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        """Enter the named room (or the default one) and return the room object."""
        room = line.strip()
        # Inside a method, ``enter`` resolves to the module-level function.
        r = enter(room) if room else enter()
        self._odps = r.odps

        return r

    @line_magic("setup")
    def setup(self, line):
        """Set up a room: the first token is the room name, the rest are arguments."""
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        """Tear down the room named on the magic line."""
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms()``."""
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        """Display the stores of the named room, or of the default room."""
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the mean completion ratio over the stages of one task.

        Stages without workers contribute 0 to the sum but still count in
        the denominator. Returns 0 when the task reports no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages
                if stage.total_workers > 0
            )
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        """Run the SQL given on the line and/or cell and return its result.

        Returns a pandas DataFrame when pandas is importable and the raw
        result parses as CSV, the raw text when parsing fails, and a
        ``ResultFrame`` when pandas is missing. Returns ``None`` for empty SQL.
        """
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if not sql:
            return None

        bar = init_progress_bar()
        # The bar is closed on every exit path, including a failing instance.
        try:
            instance = self._odps.run_sql(sql)

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name) for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or past 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            with instance.open_reader() as reader:
                try:
                    import pandas as pd

                    try:
                        return pd.read_csv(StringIO(reader.raw))
                    except ValueError:
                        return reader.raw
                except ImportError:
                    return ResultFrame(list(reader), columns=reader._columns)
        finally:
            bar.close()

    @line_magic("persist")
    def persist(self, line):
        """Upload a pandas DataFrame to a new table: ``%persist <frame> [project.]<table>``.

        Column types come from ``np_to_odps_types``, defaulting to string.

        Raises:
            TypeError: The named object is not a pandas DataFrame, or the
                target table already exists.
        """
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")

        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string) for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            # The message was previously raised without interpolating the name.
            raise TypeError("%s already exists" % table_name)

        tb = self._odps.create_table(table_name, Schema.from_lists(columns, types))

        def gen(df):
            """Yield one table record per DataFrame row, updating a progress bar."""
            size = len(df)

            bar = init_progress_bar(size)

            try:
                for i, row in enumerate(df.values):
                    # Refresh the bar every 50 rows to keep updates cheap.
                    if i % 50 == 0:
                        bar.update(min(i, size))

                    yield tb.new_record(list(row))

                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))
class ODPSSql(Magics):
    """IPython magics for entering ODPS rooms, running SQL and persisting frames."""

    # ODPS entry object used by the magics; created lazily by _set_odps()
    # or replaced by the ``%enter`` magic.
    _odps = None

    def _set_odps(self):
        """Ensure ``self._odps`` is set.

        Uses the global ``options`` when access id, access key and default
        project are all configured; otherwise falls back to the default room.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and \
                options.access_key is not None and \
                options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        """Enter the named room (or the default one) and expose its ODPS object.

        The ODPS object is also published as ``o`` in the user namespace
        unless that name is already taken. Returns the room object.
        """
        room = line.strip()
        # Inside a method, ``enter`` resolves to the module-level function.
        r = enter(room) if room else enter()
        self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        """Set up a room: the first token is the room name, the rest are arguments."""
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic('teardown')
    def teardown(self, line):
        """Tear down the room named on the magic line."""
        name = line.strip()
        teardown(name)

    @line_magic('list_rooms')
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms()``."""
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        """Display the stores of the named room, or of the default room."""
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the mean completion ratio over the stages of one task.

        Stages without workers contribute 0 to the sum but still count in
        the denominator. Returns 0 when the task reports no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(self, msg):
        """Default verbose logger: print ``msg``."""
        print(msg)

    @staticmethod
    def _reader_to_result(reader):
        """Convert an instance result reader into the value returned by ``%sql``.

        With pandas available the raw text is parsed by ``pandas.read_csv``,
        falling back to the raw text when parsing fails. Without pandas a
        ``ResultFrame`` is built, falling back to the raw text on TypeError.
        """
        try:
            import pandas as pd
        except ImportError:
            pd = None

        if pd is None:
            try:
                return ResultFrame(list(reader), columns=reader._columns)
            except TypeError:
                return reader.raw

        # The parser error class moved between pandas releases. Resolving it
        # separately keeps a missing ``pandas.parser`` module from being
        # mistaken for "pandas is not installed".
        try:
            from pandas.errors import ParserError as CParserError
        except ImportError:
            try:
                from pandas.parser import CParserError
            except ImportError:
                CParserError = ValueError

        try:
            return pd.read_csv(StringIO(reader.raw))
        except (ValueError, CParserError):
            return reader.raw

    @line_magic('set')
    def set_hint(self, line):
        """Store a ``key=value`` SQL setting in ``options.sql.settings``.

        Raises:
            ValueError: The line contains no ``=``.
        """
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        """Run the SQL given on the line and/or cell and return its result.

        Statements starting with ``set `` are collected as hints for this
        run; the remaining statements are re-joined with ``;`` and
        submitted. Returns ``None`` when there is nothing to run.
        """
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        for s in content.split(';'):
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        # Only ``set`` statements (or nothing at all): nothing to submit, and
        # the parameter substitution must not be handed ``None``.
        if not sql:
            return None

        # replace user defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)
        if not sql:
            return None

        bar = init_progress_bar()
        # The bar is closed on every exit path, including a failing instance.
        try:
            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout(' Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or past 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            with instance.open_reader() as reader:
                return self._reader_to_result(reader)
        finally:
            bar.close()

    @line_magic('persist')
    def persist(self, line):
        """Persist a frame from the user namespace: ``%persist <frame> [project.]<table>``.

        Raises:
            TypeError: The target table already exists, or the named object
                is neither an ODPS ``DataFrame`` nor a pandas ``DataFrame``.
        """
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name)
        else:
            # Do not return silently when nothing was written.
            raise TypeError('%s is not a DataFrame' % frame_name)