def download_by_tunnel(table_name, file_path, row_count, pt=None, sep=','):
    """
    Download ODPS table data through a tunnel record reader and write it to a local text file.
    :param table_name: name of the table to download
    :param file_path: path of the local file to write
    :param row_count: number of columns to write for each record
    :param pt: partition spec, optional
    :param sep: field separator, ',' by default
    :return: None
    """
    tunnel = TableTunnel(odps)
    if pt is not None:
        download_session = tunnel.create_download_session(table_name, partition_spec=pt)
    else:
        download_session = tunnel.create_download_session(table_name)
    with open(file_path, 'w') as f:
        with download_session.open_record_reader(
                0, download_session.count) as reader:
            for record in reader:
                line = ''
                for i in range(row_count):
                    if i > 0:
                        line = line + sep
                    line = line + str(record[i])
                line = line + '\n'
                f.write(line)
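A minimal call sketch for the helper above; the table name, partition value and column count are placeholders, and a global `odps` entry object is assumed to have been created already.

# Hypothetical usage: dump the first three columns of every record in
# partition ds=20240101 of `my_partitioned_table` to a tab-separated file.
download_by_tunnel('my_partitioned_table', '/tmp/my_table.tsv',
                   row_count=3, pt='ds=20240101', sep='\t')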
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel
    import pyarrow as pa
    import pandas as pd

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    retry_times = options.retry_times
    retries = 0
    while True:
        try:
            if op.partition_spec is not None:
                upload_session = tunnel.create_upload_session(
                    t.name, partition_spec=op.partition_spec)
            else:
                upload_session = tunnel.create_upload_session(t.name)
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    logger.debug('Start writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    retries = 0
    while True:
        try:
            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
            writer.write(arrow_rb)
            writer.close()
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    upload_session.commit([0])
    logger.debug('Finish writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    ctx[op.outputs[0].key] = pd.DataFrame()
import csv

from odps import ODPS
from odps.tunnel import TableTunnel


def download_infos(tablename, storename, keys):
    o = ODPS("LTAIWt3hG5GvYBhX", "RriedkAIENmPvXvRmQcy9wRqOYx3QV",
             'graph_embedding_intern_dev',
             endpoint='http://service-corp.odps.aliyun-inc.com/api')
    project = o.get_project()
    csv_file = open(storename, mode='w')
    writer = csv.writer(csv_file, delimiter='\t')
    tunnel = TableTunnel(o)
    download_session = tunnel.create_download_session(tablename)
    with download_session.open_record_reader(0, download_session.count) as reader:
        for record in reader:
            info = [record[key] for key in keys]
            writer.writerow(info)
    csv_file.close()
    print("finished storing {}".format(storename))
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    if op.partition_spec is not None:
        download_session = tunnel.create_download_session(
            t.name, partition_spec=op.partition_spec)
    else:
        download_session = tunnel.create_download_session(t.name)

    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows

    with download_session.open_arrow_reader(
            op.start_index, count, columns=op.columns) as reader:
        table = reader.read()

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(
        table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_columns(data, op.outputs[0].dtypes)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    ctx[op.outputs[0].key] = data
def read_table_split(conn, download_id, start, count, idx, partition_spec=None):
    # read part data
    from odps.tunnel import TableTunnel
    tunnel = TableTunnel(client=rest_client, project=project,
                         endpoint=tunnel_endpoint)
    session = tunnel.create_download_session(
        table_name, download_id=download_id, partition_spec=partition_spec)
    data = session.open_record_reader(start, count).to_pandas()
    conn.send((idx, data))
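A sketch of how a caller might drive read_table_split, assuming the enclosing scope already defines rest_client, project, tunnel_endpoint and table_name as in the snippet above; the pipe/process wiring, the read_splits_in_parallel helper and its parameters are illustrative and not part of the original code.

import multiprocessing


def read_splits_in_parallel(download_id, split_size, n_splits, partition_spec=None):
    # Hypothetical driver: one reader process per split; each child sends
    # (idx, DataFrame) back through its end of the pipe.
    conns, procs = [], []
    for idx in range(n_splits):
        parent_conn, child_conn = multiprocessing.Pipe()
        proc = multiprocessing.Process(
            target=read_table_split,
            args=(child_conn, download_id, idx * split_size, split_size, idx),
            kwargs=dict(partition_spec=partition_spec))
        proc.start()
        conns.append(parent_conn)
        procs.append(proc)
    # Receive before joining so large DataFrames do not block the pipes.
    results = dict(conn.recv() for conn in conns)
    for proc in procs:
        proc.join()
    return [results[i] for i in range(n_splits)]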
def uploadexcel(input_file, output_table_n='defult'):
    odps = ODPS(ACCESS_KEY_ID, ACCESS_KEY_SECRET, PROJECT,
                endpoint='http://service.odps.aliyun.com/api')
    project = odps.get_project()  # get the default project
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)

    wb = openpyxl.load_workbook(filename=input_file, read_only=True)
    ws = wb.active
    print(datetime.datetime.now())

    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)

    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name,
                                                  partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n',
          output_table.schema)

    index = 0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i = 0
            for cell in row:
                if cell.value is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i = i + 1
            writer.write(records)
            index = index + 1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)
    print('===========')
    print(datetime.datetime.now())
def testRead(self):
    options.tunnel.pd_mem_cache_size = 200
    options.tunnel.pd_row_cache_size = 200
    test_table_name = tn('test_pdio_read_into')
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(
        test_table_name, 'col1 bigint, col2 double, col3 boolean', lifecycle=1)
    data = [table.new_record([random.randint(0, 1048576), random.random(),
                              random.random() > 0.5])
            for _ in range(10000)]
    self.odps.write_table(test_table_name, data)

    tunnel = TableTunnel(self.odps)
    download_session = tunnel.create_download_session(table.name)
    reader = download_session.open_pandas_reader(0, download_session.count)

    result = reader.read()
    assert_array_equal(result.col1.values,
                       np.array([r[0] for r in data], dtype=np.int64))
    assert_array_equal(result.col2.values,
                       np.array([r[1] for r in data], dtype=np.float64))
    assert_array_equal(result.col3.values,
                       np.array([r[2] for r in data], dtype=np.bool_))
def _create_table_tunnel(self, endpoint=None):
    if self._table_tunnel is not None:
        return self._table_tunnel

    from ..tunnel import TableTunnel
    self._table_tunnel = TableTunnel(client=self._client, project=self.project,
                                     endpoint=endpoint or self.project._tunnel_endpoint)
    return self._table_tunnel
def testWriteArray(self):
    options.tunnel.pd_mem_cache_size = 200
    test_table_name = tn('test_pdio_write_array')
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(
        test_table_name, 'col1 bigint, col2 bigint, col3 double', lifecycle=1)
    data = np.random.rand(100, 300) * 1000

    tunnel = TableTunnel(self.odps)
    upload_session = tunnel.create_upload_session(table.name)
    writer = upload_session.open_pandas_writer(0)
    writer.write(data)
    writer.close()
    upload_session.commit([0])

    recv_data = np.empty((100, 300), dtype=np.double)
    for rec in self.odps.read_table(test_table_name):
        recv_data[rec[0], rec[1]] = rec[2]
    assert_array_equal(data, recv_data)

    table.truncate()

    tunnel = TableTunnel(self.odps)
    upload_session = tunnel.create_upload_session(table.name)
    writer = upload_session.open_pandas_writer(0)
    writer.write(data, dim_offsets=(500, 100))
    writer.close()
    upload_session.commit([0])

    recv_data = np.empty((100, 300), dtype=np.double)
    for rec in self.odps.read_table(test_table_name):
        recv_data[rec[0] - 500, rec[1] - 100] = rec[2]
    assert_array_equal(data, recv_data)
import time
from odps.tunnel import TableTunnel

num = 0

# initialization
print "init table and set odps table attribute"
create_table_if_not_exists(root_table)
init_root_table_odps(root_table)
init_partition(root_table)

# parent table data written
print "start to sync parent table data....."
result = dynamic.find()
tunnel = TableTunnel(odps)
print "partition values is %s" % get_partition_value(bizdate)
upload_session = tunnel.create_upload_session(
    root_table.parent_odps_table.table_name,
    partition_spec='pt=' + get_partition_value(bizdate))
with upload_session.open_record_writer(block_id=1) as writer:
    for tr in result:
        num = num + 1
        if num % 1000 == 0:
            print "%d num records has been sent to odps..." % num
        record = root_table.parent_odps_table.odps_table.new_record()
        for (index, columnName) in enumerate(root_table.parent_odps_table.column_names):
            record[index] = getValue(columnName, tr)
        writer.write(record)
upload_session.commit([1])

print "start to sync childs table data...."
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    out = op.outputs[0]

    if op.table_name is None:
        # is empty table
        ctx[out.key] = cls._build_empty_df(out)
        return

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    retry_times = op.retry_times or options.retry_times

    retries = 0
    while True:
        try:
            if op.partition_spec is not None:
                download_session = tunnel.create_download_session(
                    t.name, partition_spec=op.partition_spec)
            else:
                download_session = tunnel.create_download_session(t.name)
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows

    retries = 0
    while True:
        try:
            with download_session.open_arrow_reader(
                    op.start_index, count, columns=op.columns) as reader:
                table = reader.read()
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(
        table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_output_data(op, data)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    ctx[op.outputs[0].key] = data
def testReadInto(self):
    options.tunnel.pd_mem_cache_size = 200
    test_table_name = tn('test_pdio_read_into')
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(
        test_table_name, 'col1 bigint, col2 double, col3 boolean', lifecycle=1)
    data = [table.new_record([random.randint(0, 1048576), random.random(),
                              random.random() > 0.5])
            for _ in range(10000)]
    self.odps.write_table(test_table_name, data)

    tunnel = TableTunnel(self.odps)
    download_session = tunnel.create_download_session(table.name)
    reader = download_session.open_pandas_reader(0, download_session.count)

    read_buffer = [np.empty(5000, dtype=np.int64), np.empty(5000, dtype=np.float64),
                   np.empty(5000, dtype=np.bool_)]
    count = reader.readinto(read_buffer)
    self.assertEqual(count, 5000)

    assert_array_equal(read_buffer[0], np.array([r[0] for r in data[:5000]], dtype=np.int64))
    assert_array_equal(read_buffer[1], np.array([r[1] for r in data[:5000]], dtype=np.float64))
    assert_array_equal(read_buffer[2], np.array([r[2] for r in data[:5000]], dtype=np.bool_))

    count = reader.readinto(read_buffer)
    self.assertEqual(count, 5000)

    assert_array_equal(read_buffer[0], np.array([r[0] for r in data[5000:]], dtype=np.int64))
    assert_array_equal(read_buffer[1], np.array([r[1] for r in data[5000:]], dtype=np.float64))
    assert_array_equal(read_buffer[2], np.array([r[2] for r in data[5000:]], dtype=np.bool_))

    self.assertEqual(reader.readinto(read_buffer), 0)

    tunnel = TableTunnel(self.odps)
    download_session = tunnel.create_download_session(table.name)
    reader = download_session.open_pandas_reader(0, download_session.count,
                                                 columns=['col2', 'col3', 'col1'])

    read_buffer = [np.empty(10000, dtype=np.float64), np.empty(10000, dtype=np.bool_),
                   np.empty(10000, dtype=np.int64)]
    count = reader.readinto(read_buffer)
    self.assertEqual(count, 10000)

    assert_array_equal(read_buffer[0], np.array([r[1] for r in data], dtype=np.float64))
    assert_array_equal(read_buffer[1], np.array([r[2] for r in data], dtype=np.bool_))
    assert_array_equal(read_buffer[2], np.array([r[0] for r in data], dtype=np.int64))

    tunnel = TableTunnel(self.odps)
    download_session = tunnel.create_download_session(table.name)
    reader = download_session.open_pandas_reader(0, download_session.count,
                                                 columns=['col2', 'col3', 'col1'])

    read_buffer = [np.empty(10000, dtype=np.int64), np.empty(10000, dtype=np.float64),
                   np.empty(10000, dtype=np.bool_)]
    count = reader.readinto(read_buffer, columns=['col1', 'col2', 'col3'])
    self.assertEqual(count, 10000)

    assert_array_equal(read_buffer[0], np.array([r[0] for r in data], dtype=np.int64))
    assert_array_equal(read_buffer[1], np.array([r[1] for r in data], dtype=np.float64))
    assert_array_equal(read_buffer[2], np.array([r[2] for r in data], dtype=np.bool_))

    try:
        import pandas as pd

        tunnel = TableTunnel(self.odps)
        download_session = tunnel.create_download_session(table.name)
        reader = download_session.open_pandas_reader(0, download_session.count)

        read_buffer = pd.DataFrame(dict(col1=np.empty(10000, dtype=np.int64),
                                        col2=np.empty(10000, dtype=np.float64),
                                        col3=np.empty(10000, dtype=np.bool_)),
                                   columns='col1 col2 col3'.split())
        count = reader.readinto(read_buffer)
        self.assertEqual(count, 10000)

        assert_array_equal(read_buffer.col1.values, np.array([r[0] for r in data], dtype=np.int64))
        assert_array_equal(read_buffer.col2.values, np.array([r[1] for r in data], dtype=np.float64))
        assert_array_equal(read_buffer.col3.values, np.array([r[2] for r in data], dtype=np.bool_))
    except ImportError:
        pass
def testWriteArrays(self):
    def assert_results():
        recv_data = [np.empty((10000,), dtype=np.int64), np.empty((10000,), dtype=np.double),
                     np.empty((10000,), dtype=np.bool_)]
        for idx, rec in enumerate(self.odps.read_table(test_table_name)):
            recv_data[0][idx] = rec[0]
            recv_data[1][idx] = rec[1]
            recv_data[2][idx] = rec[2]

        assert_array_equal(raw_data[0], recv_data[0])
        assert_array_equal(raw_data[1], recv_data[1])
        assert_array_equal(raw_data[2], recv_data[2])

    options.tunnel.pd_mem_cache_size = 200
    test_table_name = tn('test_pdio_write_arrays')
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(
        test_table_name, 'col1 bigint, col2 double, col3 boolean', lifecycle=1)

    raw_data = [np.random.randint(1048576, size=(10000,)), np.random.rand(10000),
                np.random.rand(10000) > 0.5]

    data = raw_data
    tunnel = TableTunnel(self.odps)
    upload_session = tunnel.create_upload_session(table.name)
    writer = upload_session.open_pandas_writer(0)
    writer.write(data)
    writer.close()
    upload_session.commit([0])
    assert_results()

    table.truncate()

    data = dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2])
    tunnel = TableTunnel(self.odps)
    upload_session = tunnel.create_upload_session(table.name)
    writer = upload_session.open_pandas_writer(0)
    writer.write(data)
    writer.close()
    upload_session.commit([0])
    assert_results()

    table.truncate()

    try:
        import pandas as pd
        data = pd.DataFrame(dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2]),
                            columns='col1 col2 col3'.split())
        tunnel = TableTunnel(self.odps)
        upload_session = tunnel.create_upload_session(table.name)
        writer = upload_session.open_pandas_writer(0)
        writer.write(data)
        writer.close()
        upload_session.commit([0])
        assert_results()
    except ImportError:
        pass
def persist_mars_dataframe(odps, df, table_name, overwrite=False, partition=None,
                           write_batch_size=None, unknown_as_string=None,
                           as_type=None, drop_table=False, create_table=True,
                           drop_partition=False, create_partition=None,
                           lifecycle=None, runtime_endpoint=None, **kw):
    """
    Write Mars DataFrame to table.

    :param df: Mars DataFrame.
    :param table_name: table to write.
    :param overwrite: whether to overwrite existing data. False by default.
    :param partition: partition spec.
    :param write_batch_size: batch size of records to write. 1024 by default.
    :param unknown_as_string: set a column to string type if its dtype is object.
    :param as_type: specify column dtypes. {'a': 'string'} will set column `a` to string type.
    :param drop_table: drop the table if it exists, False by default.
    :param create_table: create the table first if it does not exist, True by default.
    :param drop_partition: drop the partition if it exists, False by default.
    :param create_partition: create the partition if it does not exist, None by default.
    :param lifecycle: table lifecycle. If absent, `options.lifecycle` will be used.
    :return: None
    """
    from .dataframe import write_odps_table
    from odps.tunnel import TableTunnel

    dtypes = df.dtypes
    odps_types = []
    names = []
    for name, t in zip(dtypes.keys(), list(dtypes.values)):
        names.append(name)
        if as_type and name in as_type:
            odps_types.append(as_type[name])
        else:
            odps_types.append(pd_type_to_odps_type(
                t, name, unknown_as_string=unknown_as_string))
    if partition:
        p = PartitionSpec(partition)
        schema = Schema.from_lists(names, odps_types, p.keys, ['string'] * len(p))
    else:
        schema = Schema.from_lists(names, odps_types)

    if drop_table:
        odps.delete_table(table_name, if_exists=True)

    if partition is None:
        # the non-partitioned table
        if drop_partition:
            raise ValueError('Cannot drop partition for non-partition table')
        if create_partition:
            raise ValueError('Cannot create partition for non-partition table')

        if create_table or (not odps.exist_table(table_name)):
            odps.create_table(table_name, schema, if_not_exists=True,
                              stored_as='aliorc', lifecycle=lifecycle)
    else:
        if odps.exist_table(table_name) or not create_table:
            t = odps.get_table(table_name)
            table_partition = t.get_partition(partition)
            if drop_partition:
                t.delete_partition(table_partition, if_exists=True)
            if create_partition:
                t.create_partition(table_partition, if_not_exists=True)
        else:
            odps.create_table(table_name, schema,
                              stored_as='aliorc', lifecycle=lifecycle)

    table = odps.get_table(table_name)

    if len(table.schema.simple_columns) != len(schema.simple_columns):
        raise TypeError('Table column number is %s while input DataFrame has %s columns'
                        % (len(table.schema.simple_columns), len(schema.simple_columns)))

    for c_left, c_right in zip(table.schema.simple_columns, schema.simple_columns):
        if c_left.name.lower() != c_right.name.lower() or c_left.type != c_right.type:
            raise TypeError('Column type between provided DataFrame and target table'
                            ' does not agree with each other. DataFrame column %s type is %s,'
                            ' target table column %s type is %s' %
                            (c_right.name, c_right.type, c_left.name, c_left.type))

    if partition:
        table.create_partition(partition, if_not_exists=True)

    runtime_endpoint = (runtime_endpoint or
                        kw.pop('cupid_internal_endpoint', None) or
                        cupid_options.cupid.runtime.endpoint)
    odps_params = dict(project=odps.project, endpoint=runtime_endpoint)
    if isinstance(odps.account, AliyunAccount):
        odps_params.update(dict(access_id=odps.account.access_id,
                                secret_access_key=odps.account.secret_access_key))

    if isinstance(df, pd.DataFrame):
        from cupid.runtime import RuntimeContext
        import pyarrow as pa

        if RuntimeContext.is_context_ready():
            _write_table_in_cupid(odps, df, table, partition=partition,
                                  overwrite=overwrite,
                                  unknown_as_string=unknown_as_string)
        else:
            t = odps.get_table(table_name)
            tunnel = TableTunnel(odps, project=t.project)
            if partition is not None:
                upload_session = tunnel.create_upload_session(
                    t.name, partition_spec=partition)
            else:
                upload_session = tunnel.create_upload_session(t.name)

            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(df)
            writer.write(arrow_rb)
            writer.close()
            upload_session.commit([0])
    else:
        write_odps_table(df, table, partition=partition, overwrite=overwrite,
                         odps_params=odps_params,
                         unknown_as_string=unknown_as_string,
                         write_batch_size=write_batch_size).execute()
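A minimal usage sketch for persist_mars_dataframe; the entry object, the Mars DataFrame and the table and partition names below are placeholders, and the call assumes a working PyODPS Mars extension environment.

# Hypothetical example: persist an existing Mars DataFrame `mars_df` into a
# partitioned table, creating the partition if needed. `o` is an ODPS entry object.
persist_mars_dataframe(o, mars_df, 'mars_persist_demo',
                       partition='ds=20240101', create_partition=True,
                       unknown_as_string=True, lifecycle=7)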