Example #1
def download_by_tunnel(table_name, file_path, row_count, pt=None, sep=','):
    """
    通过dataframe的方式读取odps的表数据
    :param table_name:
    :param file_path:
    :return:
    """

    tunnel = TableTunnel(odps)
    if pt is not None:
        download_session = tunnel.create_download_session(table_name,
                                                          partition_spec=pt)
    else:
        download_session = tunnel.create_download_session(table_name)
    with open(file_path, 'w') as f:
        with download_session.open_record_reader(
                0, download_session.count) as reader:
            for record in reader:
                line = ''
                for i in range(row_count):
                    if i > 0:
                        line = line + sep
                    line = line + str(record[i])
                line = line + '\n'
                f.write(line)
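
A minimal usage sketch for the helper above, assuming `odps` is already an authenticated ODPS entry object in the surrounding scope (the function references it as a global); the table name, output path and column count here are hypothetical:

download_by_tunnel('hypothetical_src_table', '/tmp/out.csv',
                   row_count=3,            # number of columns to dump per record
                   pt="ds='20200101'",     # optional partition spec
                   sep=',')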
Example #2
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    upload_session = tunnel.create_upload_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    upload_session = tunnel.create_upload_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        retries = 0
        while True:
            try:
                writer = upload_session.open_arrow_writer(0)
                arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
                writer.write(arrow_rb)
                writer.close()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        ctx[op.outputs[0].key] = pd.DataFrame()
Example #3
def download_infos(tablename, storename, keys):
    o = ODPS("LTAIWt3hG5GvYBhX", "RriedkAIENmPvXvRmQcy9wRqOYx3QV", 'graph_embedding_intern_dev',
             endpoint='http://service-corp.odps.aliyun-inc.com/api')

    project = o.get_project()
    csv_file = open(storename, mode='w')
    writer = csv.writer(csv_file, delimiter='\t')

    tunnel = TableTunnel(o)
    download_session = tunnel.create_download_session(tablename)
    with download_session.open_record_reader(0, download_session.count) as reader:
        for record in reader:
            info = [record[key] for key in keys]
            writer.writerow(info)
    csv_file.close()
    print("finished storing {}".format(storename))
Example #4
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
Example #5
    def read_table_split(conn,
                         download_id,
                         start,
                         count,
                         idx,
                         partition_spec=None):
        # read one split of the table data; rest_client, project,
        # tunnel_endpoint and table_name come from the enclosing scope
        from odps.tunnel import TableTunnel
        tunnel = TableTunnel(client=rest_client,
                             project=project,
                             endpoint=tunnel_endpoint)
        session = tunnel.create_download_session(
            table_name,
            download_id=download_id,
            partition_spec=partition_spec)
        data = session.open_record_reader(start, count).to_pandas()
        conn.send((idx, data))
Example #6
def uploadexcel(input_file, output_table_n='default'):
    odps = ODPS(ACCESS_KEY_ID,
                ACCESS_KEY_SECRET,
                PROJECT,
                endpoint='http://service.odps.aliyun.com/api')

    project = odps.get_project()  # get the default project
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)

    wb = openpyxl.load_workbook(filename=input_file, read_only=True)
    ws = wb.active
    print(datetime.datetime.now())

    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)

    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name, partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n', output_table.schema)
    index = 0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i = 0
            for cell in row:
                if cell.value is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i = i + 1
            writer.write(records)
            index = index + 1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)

    print('===========')
    print(datetime.datetime.now())
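
A minimal invocation sketch, assuming ACCESS_KEY_ID, ACCESS_KEY_SECRET and PROJECT are defined at module level and that the hypothetical target table already exists with a `ds` partition column:

uploadexcel('hypothetical_report.xlsx', output_table_n='hypothetical_ods_table')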
Example #7
    def testRead(self):
        options.tunnel.pd_mem_cache_size = 200
        options.tunnel.pd_row_cache_size = 200

        test_table_name = tn('test_pdio_read_into')
        self.odps.delete_table(test_table_name, if_exists=True, lifecycle=1)
        table = self.odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean')

        data = [table.new_record([random.randint(0, 1048576), random.random(), random.random() > 0.5])
                for _ in range(10000)]
        self.odps.write_table(test_table_name, data)

        tunnel = TableTunnel(self.odps)
        download_session = tunnel.create_download_session(table.name)
        reader = download_session.open_pandas_reader(0, download_session.count)

        result = reader.read()
        assert_array_equal(result.col1.values, np.array([r[0] for r in data], dtype=np.int64))
        assert_array_equal(result.col2.values, np.array([r[1] for r in data], dtype=np.float64))
        assert_array_equal(result.col3.values, np.array([r[2] for r in data], dtype=np.bool_))
Example #8
    def _create_table_tunnel(self, endpoint=None):
        if self._table_tunnel is not None:
            return self._table_tunnel

        from ..tunnel import TableTunnel

        self._table_tunnel = TableTunnel(client=self._client,
                                         project=self.project,
                                         endpoint=endpoint
                                         or self.project._tunnel_endpoint)
        return self._table_tunnel
Example #9
    def testWriteArray(self):
        options.tunnel.pd_mem_cache_size = 200

        test_table_name = tn('test_pdio_write_array')
        self.odps.delete_table(test_table_name, if_exists=True, lifecycle=1)
        table = self.odps.create_table(test_table_name, 'col1 bigint, col2 bigint, col3 double')

        data = np.random.rand(100, 300) * 1000

        tunnel = TableTunnel(self.odps)
        upload_session = tunnel.create_upload_session(table.name)
        writer = upload_session.open_pandas_writer(0)

        writer.write(data)

        writer.close()
        upload_session.commit([0])

        recv_data = np.empty((100, 300), dtype=np.double)
        for rec in self.odps.read_table(test_table_name):
            recv_data[rec[0], rec[1]] = rec[2]

        assert_array_equal(data, recv_data)

        table.truncate()

        tunnel = TableTunnel(self.odps)
        upload_session = tunnel.create_upload_session(table.name)
        writer = upload_session.open_pandas_writer(0)

        writer.write(data, dim_offsets=(500, 100))

        writer.close()
        upload_session.commit([0])

        recv_data = np.empty((100, 300), dtype=np.double)
        for rec in self.odps.read_table(test_table_name):
            recv_data[rec[0] - 500, rec[1] - 100] = rec[2]

        assert_array_equal(data, recv_data)
Example #10
import time
from odps.tunnel import TableTunnel

num = 0
# initialization
print("init table and set odps table attribute")

create_table_if_not_exists(root_table)
init_root_table_odps(root_table)
init_partition(root_table)

# parent table data written
print("start to sync parent table data...")
result = dynamic.find()
tunnel = TableTunnel(odps)
print("partition value is %s" % get_partition_value(bizdate))
upload_session = tunnel.create_upload_session(
    root_table.parent_odps_table.table_name,
    partition_spec='pt=' + get_partition_value(bizdate))
with upload_session.open_record_writer(block_id=1) as writer:
    for tr in result:
        num = num + 1
        if num % 1000 == 0:
            print("%d records have been sent to odps..." % num)
        record = root_table.parent_odps_table.odps_table.new_record()
        for (index, columnName) in enumerate(
                root_table.parent_odps_table.column_names):
            record[index] = getValue(columnName, tr)
        writer.write(record)
upload_session.commit([1])

print("start to sync child tables data...")
Example #11
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        out = op.outputs[0]

        if op.table_name is None:
            # is empty table
            ctx[out.key] = cls._build_empty_df(out)
            return

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = op.retry_times or options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    download_session = tunnel.create_download_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    download_session = tunnel.create_download_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        retries = 0
        while True:
            try:
                with download_session.open_arrow_reader(
                        op.start_index, count, columns=op.columns) as reader:
                    table = reader.read()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)
        data = cls._align_output_data(op, data)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
Example #12
    def testReadInto(self):
        options.tunnel.pd_mem_cache_size = 200

        test_table_name = tn('test_pdio_read_into')
        self.odps.delete_table(test_table_name, if_exists=True, lifecycle=1)
        table = self.odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean')

        data = [table.new_record([random.randint(0, 1048576), random.random(), random.random() > 0.5])
                for _ in range(10000)]
        self.odps.write_table(test_table_name, data)

        tunnel = TableTunnel(self.odps)
        download_session = tunnel.create_download_session(table.name)
        reader = download_session.open_pandas_reader(0, download_session.count)

        read_buffer = [np.empty(5000, dtype=np.int64), np.empty(5000, dtype=np.float64),
                       np.empty(5000, dtype=np.bool_)]
        count = reader.readinto(read_buffer)

        self.assertEqual(count, 5000)
        assert_array_equal(read_buffer[0], np.array([r[0] for r in data[:5000]], dtype=np.int64))
        assert_array_equal(read_buffer[1], np.array([r[1] for r in data[:5000]], dtype=np.float64))
        assert_array_equal(read_buffer[2], np.array([r[2] for r in data[:5000]], dtype=np.bool_))

        count = reader.readinto(read_buffer)

        self.assertEqual(count, 5000)
        assert_array_equal(read_buffer[0], np.array([r[0] for r in data[5000:]], dtype=np.int64))
        assert_array_equal(read_buffer[1], np.array([r[1] for r in data[5000:]], dtype=np.float64))
        assert_array_equal(read_buffer[2], np.array([r[2] for r in data[5000:]], dtype=np.bool_))

        self.assertEqual(reader.readinto(read_buffer), 0)

        tunnel = TableTunnel(self.odps)
        download_session = tunnel.create_download_session(table.name)
        reader = download_session.open_pandas_reader(0, download_session.count, columns=['col2', 'col3', 'col1'])

        read_buffer = [np.empty(10000, dtype=np.float64), np.empty(10000, dtype=np.bool_),
                       np.empty(10000, dtype=np.int64)]
        count = reader.readinto(read_buffer)

        self.assertEqual(count, 10000)
        assert_array_equal(read_buffer[0], np.array([r[1] for r in data], dtype=np.float64))
        assert_array_equal(read_buffer[1], np.array([r[2] for r in data], dtype=np.bool_))
        assert_array_equal(read_buffer[2], np.array([r[0] for r in data], dtype=np.int64))

        tunnel = TableTunnel(self.odps)
        download_session = tunnel.create_download_session(table.name)
        reader = download_session.open_pandas_reader(0, download_session.count, columns=['col2', 'col3', 'col1'])

        read_buffer = [np.empty(10000, dtype=np.int64), np.empty(10000, dtype=np.float64),
                       np.empty(10000, dtype=np.bool_)]
        count = reader.readinto(read_buffer, columns=['col1', 'col2', 'col3'])

        self.assertEqual(count, 10000)
        assert_array_equal(read_buffer[0], np.array([r[0] for r in data], dtype=np.int64))
        assert_array_equal(read_buffer[1], np.array([r[1] for r in data], dtype=np.float64))
        assert_array_equal(read_buffer[2], np.array([r[2] for r in data], dtype=np.bool_))

        try:
            import pandas as pd
            tunnel = TableTunnel(self.odps)
            download_session = tunnel.create_download_session(table.name)
            reader = download_session.open_pandas_reader(0, download_session.count)

            read_buffer = pd.DataFrame(dict(col1=np.empty(10000, dtype=np.int64),
                                            col2=np.empty(10000, dtype=np.float64),
                                            col3=np.empty(10000, dtype=np.bool_)), columns='col1 col2 col3'.split())

            count = reader.readinto(read_buffer)
            self.assertEqual(count, 10000)

            assert_array_equal(read_buffer.col1.values, np.array([r[0] for r in data], dtype=np.int64))
            assert_array_equal(read_buffer.col2.values, np.array([r[1] for r in data], dtype=np.float64))
            assert_array_equal(read_buffer.col3.values, np.array([r[2] for r in data], dtype=np.bool_))
        except ImportError:
            pass
Example #13
    def testWriteArrays(self):
        def assert_results():
            recv_data = [np.empty((10000, ), dtype=np.int64), np.empty((10000, ), dtype=np.double),
                         np.empty((10000, ), dtype=np.bool_)]

            for idx, rec in enumerate(self.odps.read_table(test_table_name)):
                recv_data[0][idx] = rec[0]
                recv_data[1][idx] = rec[1]
                recv_data[2][idx] = rec[2]

            assert_array_equal(raw_data[0], recv_data[0])
            assert_array_equal(raw_data[1], recv_data[1])
            assert_array_equal(raw_data[2], recv_data[2])

        options.tunnel.pd_mem_cache_size = 200

        test_table_name = tn('test_pdio_write_arrays')
        self.odps.delete_table(test_table_name, if_exists=True, lifecycle=1)
        table = self.odps.create_table(test_table_name, 'col1 bigint, col2 double, col3 boolean')

        raw_data = [np.random.randint(1048576, size=(10000,)), np.random.rand(10000),
                    np.random.rand(10000) > 0.5]
        data = raw_data

        tunnel = TableTunnel(self.odps)
        upload_session = tunnel.create_upload_session(table.name)
        writer = upload_session.open_pandas_writer(0)

        writer.write(data)

        writer.close()
        upload_session.commit([0])
        assert_results()

        table.truncate()

        data = dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2])

        tunnel = TableTunnel(self.odps)
        upload_session = tunnel.create_upload_session(table.name)
        writer = upload_session.open_pandas_writer(0)

        writer.write(data)

        writer.close()
        upload_session.commit([0])
        assert_results()

        table.truncate()

        try:
            import pandas as pd
            data = pd.DataFrame(dict(col1=raw_data[0], col2=raw_data[1], col3=raw_data[2]),
                                columns='col1 col2 col3'.split())

            tunnel = TableTunnel(self.odps)
            upload_session = tunnel.create_upload_session(table.name)
            writer = upload_session.open_pandas_writer(0)

            writer.write(data)

            writer.close()
            upload_session.commit([0])
            assert_results()
        except ImportError:
            pass
Example #14
import time
from odps.tunnel import TableTunnel

num = 0
# initialization
print("init table and set odps table attribute")

create_table_if_not_exists(root_table)
init_root_table_odps(root_table)
init_partition(root_table)

# parent table data written
print("start to sync parent table data...")
result = dynamic.find()
tunnel = TableTunnel(odps)
print("partition value is %s" % get_partition_value(bizdate))
upload_session = tunnel.create_upload_session(
    root_table.parent_odps_table.table_name,
    partition_spec='pt=' + get_partition_value(bizdate))
with upload_session.open_record_writer(block_id=1) as writer:
    for tr in result:
        num = num + 1
        if num % 1000 == 0:
            print("%d records have been sent to odps..." % num)
        record = root_table.parent_odps_table.odps_table.new_record()
        for (index, columnName) in enumerate(
                root_table.parent_odps_table.column_names):
            record[index] = getValue(columnName, tr)
        writer.write(record)
upload_session.commit([1])
Example #15
def persist_mars_dataframe(odps,
                           df,
                           table_name,
                           overwrite=False,
                           partition=None,
                           write_batch_size=None,
                           unknown_as_string=None,
                           as_type=None,
                           drop_table=False,
                           create_table=True,
                           drop_partition=False,
                           create_partition=None,
                           lifecycle=None,
                           runtime_endpoint=None,
                           **kw):
    """
    Write Mars DataFrame to table.

    :param df: Mars DataFrame.
    :param table_name: table to write.
    :param overwrite: whether to overwrite existing data. False by default.
    :param partition: partition spec.
    :param write_batch_size: batch size of records to write. 1024 by default.
    :param unknown_as_string: set a column to string type if its dtype is object.
    :param as_type: specify column dtypes. {'a': 'string'} will set column `a` as string type.
    :param drop_table: drop the table if it exists, False by default
    :param create_table: create the table first if it does not exist, True by default
    :param drop_partition: drop the partition if it exists, False by default
    :param create_partition: create the partition if it does not exist, None by default
    :param lifecycle: table lifecycle. If absent, `options.lifecycle` will be used.

    :return: None
    """
    from .dataframe import write_odps_table
    from odps.tunnel import TableTunnel

    dtypes = df.dtypes
    odps_types = []
    names = []
    for name, t in zip(dtypes.keys(), list(dtypes.values)):
        names.append(name)
        if as_type and name in as_type:
            odps_types.append(as_type[name])
        else:
            odps_types.append(
                pd_type_to_odps_type(t,
                                     name,
                                     unknown_as_string=unknown_as_string))
    if partition:
        p = PartitionSpec(partition)
        schema = Schema.from_lists(names, odps_types, p.keys,
                                   ['string'] * len(p))
    else:
        schema = Schema.from_lists(names, odps_types)

    if drop_table:
        odps.delete_table(table_name, if_exists=True)

    if partition is None:
        # the non-partitioned table
        if drop_partition:
            raise ValueError('Cannot drop partition for a non-partitioned table')
        if create_partition:
            raise ValueError('Cannot create partition for a non-partitioned table')

        if create_table or (not odps.exist_table(table_name)):
            odps.create_table(table_name,
                              schema,
                              if_not_exists=True,
                              stored_as='aliorc',
                              lifecycle=lifecycle)
    else:
        if odps.exist_table(table_name) or not create_table:
            t = odps.get_table(table_name)
            table_partition = t.get_partition(partition)
            if drop_partition:
                t.delete_partition(table_partition, if_exists=True)
            if create_partition:
                t.create_partition(table_partition, if_not_exists=True)

        else:
            odps.create_table(table_name,
                              schema,
                              stored_as='aliorc',
                              lifecycle=lifecycle)

    table = odps.get_table(table_name)

    if len(table.schema.simple_columns) != len(schema.simple_columns):
        raise TypeError(
            'Table column number is %s while input DataFrame has %s columns' %
            (len(table.schema.simple_columns), len(schema.simple_columns)))

    for c_left, c_right in zip(table.schema.simple_columns,
                               schema.simple_columns):
        if (c_left.name.lower() != c_right.name.lower()
                or c_left.type != c_right.type):
            raise TypeError(
                'Column types of the provided DataFrame and the target table'
                ' do not agree with each other. DataFrame column %s type is %s, '
                'target table column %s type is %s' %
                (c_right.name, c_right.type, c_left.name, c_left.type))

    if partition:
        table.create_partition(partition, if_not_exists=True)
    runtime_endpoint = (runtime_endpoint
                        or kw.pop('cupid_internal_endpoint', None)
                        or cupid_options.cupid.runtime.endpoint)
    odps_params = dict(project=odps.project, endpoint=runtime_endpoint)
    if isinstance(odps.account, AliyunAccount):
        odps_params.update(
            dict(access_id=odps.account.access_id,
                 secret_access_key=odps.account.secret_access_key))
    if isinstance(df, pd.DataFrame):
        from cupid.runtime import RuntimeContext
        import pyarrow as pa

        if RuntimeContext.is_context_ready():
            _write_table_in_cupid(odps,
                                  df,
                                  table,
                                  partition=partition,
                                  overwrite=overwrite,
                                  unknown_as_string=unknown_as_string)
        else:
            t = odps.get_table(table_name)
            tunnel = TableTunnel(odps, project=t.project)

            if partition is not None:
                upload_session = tunnel.create_upload_session(
                    t.name, partition_spec=partition)
            else:
                upload_session = tunnel.create_upload_session(t.name)

            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(df)
            writer.write(arrow_rb)
            writer.close()
            upload_session.commit([0])

    else:
        write_odps_table(df,
                         table,
                         partition=partition,
                         overwrite=overwrite,
                         odps_params=odps_params,
                         unknown_as_string=unknown_as_string,
                         write_batch_size=write_batch_size).execute()
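
A minimal usage sketch, assuming `o` is an authenticated ODPS entry object and `df` is a Mars (or pandas) DataFrame whose columns match the target table's schema; the table name and partition spec here are illustrative only:

persist_mars_dataframe(o, df, 'hypothetical_result_table',
                       partition='pt=20200101',
                       create_partition=True,
                       unknown_as_string=True,
                       lifecycle=7)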