Пример #1
0
    def check_instance_idle(self):
        last_active_time = self._last_activity_time
        has_running = False
        for ref in self._session_refs.values():
            for info in ref.get_graph_infos().values():
                if info.get('end_time') is None:
                    has_running = True
                    break
                else:
                    last_active_time = max(info['end_time'], last_active_time)
            if has_running:
                break

        if not has_running and last_active_time < time.time(
        ) - self._idle_timeout:
            # timeout: we need to kill the instance
            from odps import ODPS
            from odps.accounts import BearerTokenAccount
            from cupid.runtime import context

            logger.warning('Timeout met, killing the instance now.')

            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ['ODPS_PROJECT_NAME']
            endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
            o = ODPS(None,
                     None,
                     account=account,
                     project=project,
                     endpoint=endpoint)

            o.stop_instance(os.environ['MARS_K8S_POD_NAMESPACE'])
        else:
            self.ref().check_instance_idle(_delay=10, _tell=True, _wait=False)
Пример #2
0
def _handle_terminate_instance(sock):
    from cupid.runtime import context, RuntimeContext
    from odps import ODPS
    from odps.accounts import BearerTokenAccount

    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with key
        cmd_body = pickle.loads(sock.recv(cmd_len))

        instance_id = cmd_body['instance_id']

        if not RuntimeContext.is_context_ready():
            logger.warning('Cupid context not ready')
        else:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ['ODPS_PROJECT_NAME']
            endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
            o = ODPS(None,
                     None,
                     account=account,
                     project=project,
                     endpoint=endpoint)

            o.stop_instance(instance_id)
    except:
        logger.exception('Failed to put kv value')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Пример #3
0
def load2excel(filepath, begin, end):
    o = ODPS(ACCESS_KEY_ID,
             ACCESS_KEY_SECRET,
             PROJECT,
             endpoint='http://service.odps.aliyun.com/api')

    print(o)

    sql = "select * from stat_table where ds>={begin} and ds<={end}".format(begin=begin, end=end)
    with o.execute_sql(sql).open_reader() as reader:
        print(sql)
        print(reader.count)
        table_title = reader._schema.names
        print(table_title)

        workbook = openpyxl.Workbook()
        worksheet = workbook.create_sheet('Sheet')
        for i in range(len(table_title)):
            worksheet.cell(1, i + 1, table_title[i])

        write_count = 2
        for r in reader:
            for i in range(len(r.values)):  # 10个依次写入当前行
                worksheet.cell(write_count, i + 1, r.values[i])
            write_count = write_count + 1
        workbook.save(filename=filepath)
Пример #4
0
    def setUp(self):
        self.project = os.environ[MaxComputeConfig.PROJECT_NAME]
        access_id = os.environ[MaxComputeConfig.ACCESS_ID]
        access_key = os.environ[MaxComputeConfig.ACCESS_KEY]
        endpoint = os.environ.get(MaxComputeConfig.ENDPOINT)
        tunnel_endpoint = os.environ.get(MaxComputeConfig.TUNNEL_ENDPOINT,
                                         None)
        self.test_table = "test_odps_data_reader_%d_%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        self.odps_client = ODPS(access_id, access_key, self.project, endpoint)
        create_iris_odps_table(self.odps_client, self.project, self.test_table)
        self.records_per_task = 50

        self.reader = ODPSDataReader(
            project=self.project,
            access_id=access_id,
            access_key=access_key,
            endpoint=endpoint,
            table=self.test_table,
            tunnel_endpoint=tunnel_endpoint,
            num_processes=1,
            records_per_task=self.records_per_task,
        )
Пример #5
0
    def check_instance_idle(self):
        from cupid.runtime import context

        has_running, active_time_from_service = self._get_service_activity_info(
        )
        if active_time_from_service != self._last_active_time_from_service:
            self._last_active_time = active_time_from_service
            self._last_active_time_from_service = active_time_from_service
        elif has_running:
            self._last_active_time = time.time()

        if self._last_active_time < time.time() - self._idle_timeout:
            # timeout: we need to kill the instance
            from odps import ODPS
            from odps.accounts import BearerTokenAccount

            logger.warning('Timeout met, killing the instance now.')

            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ['ODPS_PROJECT_NAME']
            endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
            o = ODPS(None,
                     None,
                     account=account,
                     project=project,
                     endpoint=endpoint)

            o.stop_instance(os.environ['MARS_K8S_POD_NAMESPACE'])
        else:
            kv_store = context().kv_store()
            kv_store[CUPID_LAST_IDLE_TIME_KEY] = str(self._last_active_time)
            self.ref().check_instance_idle(_delay=10, _tell=True, _wait=False)
Пример #6
0
    def _execute_in_cupid(cls, ctx, op):
        import os

        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount

        cupid_client = CupidServiceClient()
        to_store_data = ctx[op.inputs[0].key]

        bearer_token = cupid_client.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        odps_schema = o.get_table(op.table_name).schema
        project_name, table_name = op.table_name.split('.')

        writer_config = dict(_table_name=table_name,
                             _project_name=project_name,
                             _table_schema=odps_schema,
                             _partition_spec=op.partition_spec,
                             _block_id=op.block_id,
                             _handle=op.cupid_handle)
        cupid_client.write_table_data(writer_config, to_store_data,
                                      op.write_batch_size)
        ctx[op.outputs[0].key] = pd.DataFrame()
Пример #7
0
def get_lt_etid():
    logging.info('12. 正在获取其他的企业 ')
    print utils.current_time(), '建立odps链接..'
    o = ODPS('LTAIzEuNzcL6qJJ8', 'eUAgj9ijhWCvOQ3w5Uv3FkwhNxvPF2',
             'database_test', 'http://service.odps.aliyun.com/api')
    print utils.current_time(), '进行查询...'
    pt = time.strftime('%Y%m%d', time.localtime(int(time.time() - 86400)))
    res = o.execute_sql(
        "select distinct etid from et_jobs where pt='{}' and isheadhunter=1".
        format(pt))
    print utils.current_time(), '处理查询结果...'
    etid_set = set()
    conn = utils.get_local_db()
    addtime = int(time.time())
    cnt = 0
    with res.open_reader() as reader:
        print utils.current_time(), '共需处理{}条!'.format(reader.count)
        for record in reader:
            etid_set.add((record['etid'], ))
            if len(etid_set) >= 1000:
                conn.executemany(
                    "insert into et_info_status(etid,addtime) values(%s,{})on duplicate key update etid=values(etid), addtime=values(addtime)"
                    .format(addtime), list(etid_set))
                cnt += 1000
                print utils.current_time(), '当前已写入{}条!'.format(cnt)
                etid_set.clear()
    if len(etid_set) > 0:
        conn.executemany(
            "insert into et_info_status(etid,addtime) values(%s,{})on duplicate key update etid=values(etid), addtime=values(addtime)"
            .format(addtime), list(etid_set))
        cnt += len(etid_set)
        print utils.current_time(), '当前已写入{}条!'.format(cnt)
    conn.close()
    return reader.count
Пример #8
0
 def __init__(self, conn_uri):
     super(MaxComputeConnection, self).__init__(conn_uri)
     user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(
         conn_uri)
     self.driver = "maxcompute"
     self.params["database"] = proj
     self.endpoint = endpoint
     self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)
Пример #9
0
class OdpsConn(object):
    """
    odps 连接
    """
    def __init__(self, project):
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project

        self.odps = None

    def __enter__(self):
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            raise ValueError(exc.message)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        del self

    def get_table_count_and_names(self):
        """
        获取一个项目下的table的数量和table的名字
        :return:
        """
        tables = self.odps.list_tables()
        names = [table.name for table in tables]
        count = len(names)
        return count, names

    def get_table_schema(self, tname):
        """
        获取表字段
        :return: 
        """
        table = self.odps.get_table(tname)
        _sa = table.schema
        _columns = _sa.columns
        schema = [item.name for item in _columns]
        return schema

    def execute_sql(self, sql):
        rest = []
        with self.odps.execute_sql(sql).open_reader() as reader:
            for record in reader:
                rest.append(record.values)
        return rest

    def get_table_last_update_time(self, tname):
        t = self.odps.get_table(tname)
        last_update_time = t.last_modified_time if t else None
        return last_update_time

    def count_table(self, table):
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
    def tile(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = op.inputs[0]

        out_chunks = []
        out_chunk_shape = (0,) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
            chunk_op = DataFrameWriteTableSplit(dtypes=op.dtypes, table_name=op.table_name,
                                                partition_spec=op.partition_spec,
                                                cupid_handle=to_str(upload_session.handle),
                                                block_id=block_id, write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index, dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) > combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i: i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                    chk = chk_op.new_chunk(chks, shape=out_chunk_shape, dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
                                                    cupid_handle=to_str(upload_session.handle),
                                                    overwrite=op.overwrite, odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape, dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                     dtypes=out_df.dtypes, chunks=[commit_table_chunk],
                                     nsplits=((0,),) * len(out_chunk_shape))
Пример #11
0
 def _get_table_schema(self):
     odps_client = ODPS(
         access_id=self._kwargs["access_id"],
         secret_access_key=self._kwargs["access_key"],
         project=self._kwargs["project"],
         endpoint=self._kwargs.get("endpoint"),
     )
     odps_table = odps_client.get_table(self._kwargs["table"])
     return odps_table.schema
Пример #12
0
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    upload_session = tunnel.create_upload_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    upload_session = tunnel.create_upload_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                time.sleep(1)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        retries = 0
        while True:
            try:
                writer = upload_session.open_arrow_writer(0)
                arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
                writer.write(arrow_rb)
                writer.close()
                break
            except:
                if retries >= retry_times:
                    raise
                time.sleep(1)

        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        ctx[op.outputs[0].key] = pd.DataFrame()
Пример #13
0
    def testTokenizedSignServerAccount(self):
        server = SignServer(token=str(uuid.uuid4()))
        server.accounts[
            self.odps.account.access_id] = self.odps.account.secret_access_key
        try:
            server.start(('127.0.0.1', 0))
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            self.assertRaises(
                SignServerError,
                lambda: odps.delete_table(tn('test_sign_account_table'),
                                          if_exists=True))

            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address,
                                        token=server.token)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            odps.delete_table(tn('test_sign_account_table'), if_exists=True)
            t = odps.create_table(tn('test_sign_account_table'),
                                  'col string',
                                  lifecycle=1)
            self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
            t.drop(async=True)
        finally:
            server.stop()
Пример #14
0
 def __init__(self, conn_uri):
     super().__init__(conn_uri)
     self.params["database"] = self.params["curr_project"]
     # compose an endpoint, only keep the host and path and replace scheme
     endpoint = self.uripts._replace(scheme=self.params["scheme"],
                                     query="",
                                     netloc=self.uripts.hostname)
     self._conn = ODPS(self.uripts.username,
                       self.uripts.password,
                       project=self.params["database"],
                       endpoint=endpoint.geturl())
Пример #15
0
 def setUp(self):
     self._project = os.environ[ODPSConfig.PROJECT_NAME]
     self._access_id = os.environ[ODPSConfig.ACCESS_ID]
     self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
     self._endpoint = os.environ[ODPSConfig.ENDPOINT]
     self._test_read_table = "chicago_taxi_train_data"
     self._test_write_table = "test_odps_writer_%d_%d" % (
         int(time.time()),
         random.randint(1, 101),
     )
     self._odps_client = ODPS(self._access_id, self._access_key,
                              self._project, self._endpoint)
Пример #16
0
    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        partition=None,
        num_processes=None,
        options=None,
        transform_fn=None,
        columns=None,
    ):
        """
        Constructs a `ODPSReader` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name.
            tunnel_endpoint: ODPS tunnel endpoint.
            partition: ODPS table's partition.
            options: Other options passed to ODPS context.
            num_processes: Number of parallel processes on this worker.
                If `None`, use the number of cores.
            transform_fn: Customized transfrom function
            columns: list of table column names
        """
        super(ODPSReader, self).__init__()

        if table.find(".") > 0:
            project, table = table.split(".")
        if options is None:
            options = {}
        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._partition = partition
        self._num_processes = num_processes
        _configure_odps_options(self._endpoint, options)
        self._odps_table = ODPS(
            self._access_id,
            self._access_key,
            self._project,
            self._endpoint,
        ).get_table(self._table)

        self._transform_fn = transform_fn
        self._columns = columns
Пример #17
0
    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and \
                    options.access_key is not None and \
                    options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps
Пример #18
0
def download_infos(tablename, storename, keys):
    o = ODPS("LTAIWt3hG5GvYBhX", "RriedkAIENmPvXvRmQcy9wRqOYx3QV", 'graph_embedding_intern_dev',
             endpoint='http://service-corp.odps.aliyun-inc.com/api')

    project = o.get_project()
    csv_file = open(storename, mode='w')
    writer = csv.writer(csv_file, delimiter='\t')

    tunnel = TableTunnel(o)
    download_session = tunnel.create_download_session(tablename)
    with download_session.open_record_reader(0, download_session.count) as reader:
        for record in reader:
            info = [record[key] for key in keys]
            writer.writerow(info)
    print("complete storing {}".format(storename))
Пример #19
0
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
Пример #20
0
 def setUp(self):
     self._project = os.environ[ODPSConfig.PROJECT_NAME]
     self._access_id = os.environ[ODPSConfig.ACCESS_ID]
     self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
     self._endpoint = os.environ.get(ODPSConfig.ENDPOINT)
     self._test_read_table = "test_odps_reader_%d_%d" % (
         int(time.time()),
         random.randint(1, 101),
     )
     self._test_write_table = "test_odps_writer_%d_%d" % (
         int(time.time()),
         random.randint(1, 101),
     )
     self._odps_client = ODPS(self._access_id, self._access_key,
                              self._project, self._endpoint)
     self.create_iris_odps_table()
Пример #21
0
    def execute(cls, ctx, op):
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if op.is_terminal:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ.get('ODPS_PROJECT_NAME', None)
            odps_params = op.odps_params.copy()
            if project:
                odps_params['project'] = project
            endpoint = os.environ.get(
                'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
            o = ODPS(None,
                     None,
                     account=account,
                     project=odps_params['project'],
                     endpoint=endpoint)
            cupid_session = CupidSession(o)

            project_name, table_name = op.table_name.split('.')
            upload_session = CupidTableUploadSession(session=cupid_session,
                                                     table_name=table_name,
                                                     project_name=project_name,
                                                     handle=op.cupid_handle,
                                                     blocks=op.blocks)
            upload_session.commit(overwrite=op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
Пример #22
0
class MaxComputeConnection(Connection):
    """MaxCompute connection, this class uses ODPS object to establish
    connection with maxcompute

    Args:
        conn_uri: uri in format:
        maxcompute://access_id:[email protected]/api?curr_project=test_ci&scheme=http
    """
    def __init__(self, conn_uri):
        super().__init__(conn_uri)
        self.params["database"] = self.params["curr_project"]
        # compose an endpoint, only keep the host and path and replace scheme
        endpoint = self.uripts._replace(scheme=self.params["scheme"],
                                        query="",
                                        netloc=self.uripts.hostname)
        self._conn = ODPS(self.uripts.username,
                          self.uripts.password,
                          project=self.params["database"],
                          endpoint=endpoint.geturl())

    def _parse_uri(self):
        return super()._parse_uri()

    def _get_result_set(self, statement):
        try:
            instance = self._conn.execute_sql(statement)
            return MaxComputeResultSet(instance)
        except Exception as e:
            return MaxComputeResultSet(None, str(e))

    def close(self):
        if self._conn:
            self._conn = None
Пример #23
0
def _build_locals_in_odps(cls, ctx, op):
    from odps import ODPS
    sess = ctx.get_current_session().as_default()
    local = {'session': sess}
    if op.data is not None:
        local.update(op.data)
    o = ODPS.from_environments()
    return dict(o=o, session=sess, odps=o)
    def createTimeButtons(self, event=None):
        start_date_select = self.cal1.selection_get().strftime("%Y%m%d")
        end_date_select = self.cal2.selection_get().strftime("%Y%m%d")
        cat1_select = self.genreCombo.get()
        cat2_select = [
            self.Listbox1.get(v) for v in self.Listbox1.curselection()
        ]
        cat3_select = [
            self.Listbox2.get(a) for a in self.Listbox2.curselection()
        ]
        file_name = str(self.file_name.get())

        print(start_date_select)
        print(end_date_select)
        print(cat1_select)
        print(cat2_select)
        print(cat3_select)
        print(file_name)

        # Web Hooks

        o = ODPS('', '', '', endpoint='')
        options.tunnel.endpoint = '**API URL**'
        query = """SELECT  DISTINCT buyer_id
                        ,email_address
                        ,phone_number
                        ,venture_category1_name_en
                        ,venture_category2_name_en
                        ,venture_category3_name_en
                  FROM   Table Name AS t
                  WHERE  venture = ""
                  AND    order_status_esm NOT IN ('invalid')
                  AND    category1_name_en in ('""" + cat1_select + """')
                  AND    category2_name_en in (""" + "'" + "','".join(
            cat2_select) + "'" + """)
                  AND    category3_name_en in (""" + "'" + "','".join(
                cat3_select) + "'" + """)
                  AND    TO_CHAR(t.order_create_date,'yyyymmdd') BETWEEN '""" + start_date_select + """'
                  AND    '""" + end_date_select + """'
                  """
        # Executing the query and set the data into a dataframe
        df = o.execute_sql(query).open_reader().to_result_frame().to_pandas()
        df.columns = df.columns.str.upper()
        date_string = arrow.now().format('YYYYMMDD') + ' ' + file_name
        df.to_csv(str(str(date_string) + '.csv'), index=False)
        print(query)
Пример #25
0
    def testBearerTokenAccount(self):
        self.odps.delete_table(tn('test_bearer_token_account_table'),
                               if_exists=True)
        t = self.odps.create_table(tn('test_bearer_token_account_table'),
                                   'col string',
                                   lifecycle=1)
        with t.open_writer() as writer:
            records = [['val1'], ['val2'], ['val3']]
            writer.write(records)

        inst = self.odps.execute_sql('select count(*) from {0}'.format(
            tn('test_bearer_token_account_table')),
                                     async_=True)
        inst.wait_for_success()
        task_name = inst.get_task_names()[0]

        logview_address = inst.get_logview_address()
        token = logview_address[logview_address.find('token=') +
                                len('token='):]
        bearer_token_account = BearerTokenAccount(token=token)
        bearer_token_odps = ODPS(None,
                                 None,
                                 self.odps.project,
                                 self.odps.endpoint,
                                 account=bearer_token_account)
        bearer_token_instance = bearer_token_odps.get_instance(inst.id)

        self.assertEqual(inst.get_task_result(task_name),
                         bearer_token_instance.get_task_result(task_name))
        self.assertEqual(inst.get_task_summary(task_name),
                         bearer_token_instance.get_task_summary(task_name))

        with self.assertRaises(errors.NoPermission):
            bearer_token_odps.create_table(
                tn('test_bearer_token_account_table_test1'),
                'col string',
                lifecycle=1)

        fake_token_account = BearerTokenAccount(token='fake-token')
        bearer_token_odps = ODPS(None,
                                 None,
                                 self.odps.project,
                                 self.odps.endpoint,
                                 account=fake_token_account)

        with self.assertRaises(errors.ODPSError):
            bearer_token_odps.create_table(
                tn('test_bearer_token_account_table_test2'),
                'col string',
                lifecycle=1)
Пример #26
0
def _handle_enum_table_partitions(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, partition
        task_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import context

        cupid_ctx = context()

        odps_params = task_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)

        table = o.get_table(task_config['table_name'])
        partition_desc = task_config.get('partition')
        if not table.schema.partitions:
            _write_request_result(sock, result=None)
        elif partition_desc:
            if check_partition_exist(table, partition_desc):
                _write_request_result(sock, result=[partition_desc])
            else:
                parts = filter_partitions(o, list(table.partitions),
                                          partition_desc)
                _write_request_result(
                    sock, result=[str(pt.partition_spec) for pt in parts])
        else:
            _write_request_result(
                sock,
                result=[str(pt.partition_spec) for pt in table.partitions])
    except:
        logger.exception('Failed to create download session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Пример #27
0
    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Constructs a `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name.
            columns: The list of column names in the table,
                which will be inferred if the table exits.
            column_types" The list of column types in the table,
                which will be inferred if the table exits.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        if table.find(".") > 0:
            project, table = table.split(".")
        if options is None:
            options = {}
        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        self._odps_table = None
        _configure_odps_options(self._endpoint, options)
        self._odps_client = ODPS(
            self._access_id, self._access_key, self._project, self._endpoint
        )
Пример #28
0
def _handle_create_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Пример #29
0
def uploadexcel(input_file,output_table_n='defult'):
    odps =  ODPS(ACCESS_KEY_ID,
             ACCESS_KEY_SECRET,
             PROJECT,
             endpoint='http://service.odps.aliyun.com/api')

    project = odps.get_project()  # 取到默认项目
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)

    wb = openpyxl.load_workbook(filename=input_file,read_only=True)
    ws = wb.active
    print(datetime.datetime.now())

    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)

    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name, partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n', output_table.schema)
    index=0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i=0
            for cell in row:
                if cell is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i=i+1
            writer.write(records)
            index=index+1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)

    print('===========')
    print(datetime.datetime.now())
Пример #30
0
    def get_connection(self, db_name=None):
        if self.conn:
            return self.conn

        db_name = db_name if db_name else self.instance.db_name

        if db_name is None:
            raise ValueError("db_name不能为空")

        self.conn = ODPS(self.user, self.password, project=db_name, endpoint=self.host)

        return self.conn
Пример #31
0
    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and \
                    options.access_key is not None and \
                    options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps
Пример #32
0
class ODPSSql(Magics):

    _odps = None

    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and \
                    options.access_key is not None and \
                    options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)
        html_notify('setup succeeded')

    @line_magic('teardown')
    def teardown(self, line):
        name = line.strip()
        teardown(name)
        html_notify('teardown succeeded')

    @line_magic('list_rooms')
    def list_rooms(self, line):
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(cls, msg):
        print(msg)

    @line_magic('set')
    def set_hint(self, line):
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        splits = content.split(';')
        for s in splits:
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        # replace user defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout('  Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd
                        from pandas.parser import CParserError

                        try:
                            res = pd.read_csv(StringIO(reader.raw))
                        except (ValueError, CParserError):
                            res = reader.raw
                    except ImportError:
                        try:
                            res = ResultFrame(list(reader), columns=reader._columns)
                        except TypeError:
                            res = reader.raw

                html_notify('SQL execution succeeded')
                return res
            finally:
                bar.close()

    @line_magic('persist')
    def persist(self, line):
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name, notify=False)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name, notify=False)
        html_notify('Persist succeeded')
Пример #33
0
class ODPSSql(Magics):

    _odps = None

    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and options.access_key is not None and options.default_project is not None:
            self._odps = ODPS(
                options.access_id,
                options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps

        return r

    @line_magic("setup")
    def setup(self, line):
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages
                if stage.total_workers > 0
            )
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql)

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name) for name in task_names) / len(task_names)
                else:
                    percent = 0
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except ValueError:
                            return reader.raw
                    except ImportError:
                        return ResultFrame(list(reader), columns=reader._columns)
            finally:
                bar.close()

    @line_magic("persist")
    def persist(self, line):
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")

        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string) for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError("%s already exists")

        tb = self._odps.create_table(table_name, Schema.from_lists(columns, types))

        def gen(df):
            size = len(df)

            bar = init_progress_bar(size)

            try:
                c = itertools.count()
                for row in df.values:
                    i = next(c)
                    if i % 50 == 0:
                        bar.update(min(i, size))

                    yield tb.new_record(list(row))

                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))