Example No. 1
    def _create_flow(self, flow_name, *tasks, deps):
        flow = Flow(flow_name, tasks, deps)

        flow_repodir = os.path.join(self.context.workdir, "flows")
        flow_workdir = os.path.join(flow_repodir, flow_name)
        os.makedirs(flow_workdir, exist_ok=True)
        for task in tasks:
            job_file = os.path.join(flow_workdir, task + ".job")
            with open(job_file, 'w') as f:
                f.write("type=command\n")
                if task in deps and len(deps[task]) > 0:
                    f.write("dependencies=" + ','.join(deps[task]) + "\n")
                f.write("command=" + self.cmd.format(task=task))
                f.flush()

        if len(flow.forest) > 1:
            job_file = os.path.join(flow_workdir, flow_name + ".job")
            with open(job_file, 'w') as f:
                f.write("type=command\n")
                f.write("dependencies=" + ','.join(flow.forest) + "\n")
                f.write("command=echo flow done\n")
                f.write("failure.emails=" + self.notify_mails)
                f.flush()
        logger.debug("Job files generation succeed")

        import zipfile
        def zipdir(path, ziph):
            # ziph is zipfile handle
            for root, dirs, files in os.walk(path):
                for file in files:
                    ziph.write(os.path.join(root, file))

        job_zip = os.path.join(flow_repodir, flow_name + ".zip")
        zipf = zipfile.ZipFile(job_zip, 'w', zipfile.ZIP_DEFLATED)
        zipdir(flow_workdir, zipf)
        zipf.close()

        logger.debug("Job files zipped into {}".format(job_zip))

        files = {
            'file': (flow_name + '.zip', open(job_zip, 'rb'), 'application/zip', {'Expires': '0'})
        }
        self._call_api('manager', 'upload', require_login=True, method='POST', attachment=files, project=self.project)

        import shutil
        shutil.rmtree(flow_workdir, ignore_errors=True)
        os.remove(job_zip)

        logger.info("Azkaban flow {} updated, you can go to {} to check".format(flow_name,
                                                                                self.host + "/manager?project=" + self.project + "&flow=" + flow_name))
Example No. 2
    def load_query(self, query, db, **kwargs):
        conn = self.open(db)
        df = pd.read_sql_query(query, con=conn)

        logger.info("before memory: " +
                    str(df.memory_usage(deep=True).sum() / 1024**2) + " MB")
        # 优化内存使用
        # 1.使用子类型优化数字列
        df_int = df.select_dtypes(include=['int64'])
        convert_int = df_int.apply(pd.to_numeric, downcast='unsigned')
        for col in df_int.columns:
            df[col] = convert_int[col]

        # 2. Convert low-cardinality object columns to the category dtype
        df_obj = df.select_dtypes(include=['object'])
        if len(df_obj.columns) > 0:
            for col in df_obj.columns:
                num_unique_values = len(df_obj[col].unique())
                num_total_values = len(df_obj[col])
                if num_total_values > 0 and num_unique_values / num_total_values < 0.5:
                    df[col] = df[col].astype('category')
        logger.info("after memory: " +
                    str(df.memory_usage(deep=True).sum() / 1024**2) + " MB")
        return df
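
A brief usage sketch, assuming a hypothetical connection wrapper `conn_mgr` whose `open(db)` returns a connection that `pd.read_sql_query` accepts; the query and db key are illustrative:

    # `conn_mgr` and the 'warehouse' db key are hypothetical
    df = conn_mgr.load_query("SELECT user_id, country, amount FROM orders", db='warehouse')
    # int64 columns are downcast and low-cardinality object columns become 'category',
    # so the logged "after" memory figure should be noticeably smaller than "before"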
Example No. 3
    def store(self, df, table, db, **kwargs):
        assert isinstance(df, pd.DataFrame), "Invalid data type"
        if_exists = kwargs.get('if_exists', 'fail')
        chunksize = kwargs.get('chunksize', 10000)
        pkey = kwargs.get('pkey', None)
        indexes = kwargs.get('indexes', [])
        checkpoint_column = kwargs.get('checkpoint_column', None)
        checkpoint = kwargs.get('checkpoint')
        last_checkpoint = kwargs.get('last_checkpoint')

        _conn = self.open(db)

        try:
            if if_exists == 'append' or if_exists == 'update':
                target_table = Table(table,
                                     MetaData(),
                                     autoload=True,
                                     autoload_with=_conn)
                assert checkpoint_column is not None, "checkpoint_column is required in append/update mode!"
                assert (isinstance(checkpoint_column, tuple) and len(checkpoint_column) == 2) \
                    or isinstance(checkpoint_column, str), \
                    "checkpoint_column can only be a str or a 2-tuple!"

                if isinstance(checkpoint_column, tuple):
                    (create_time_column,
                     update_time_column) = checkpoint_column
                else:
                    create_time_column = checkpoint_column
                    update_time_column = checkpoint_column

                # delete extra records over last checkpoint in append/update mode
                clear_ins = target_table.delete().where(
                    Column(update_time_column) >= last_checkpoint)
                _conn.execute(clear_ins)

                if if_exists == 'update':
                    assert pkey is not None, "primary key is required in update mode!"
                    assert isinstance(
                        pkey, str), "update mode only supports a single primary key"
                    update_df = df[df[create_time_column] < last_checkpoint]
                    if not update_df.empty:
                        logger.info(table +
                                    ": find {} records to update".format(
                                        len(update_df)))
                        update_keys = list(update_df[pkey])
                        delete_ins = target_table.delete().where(
                            Column(pkey).in_(update_keys))
                        _conn.execute(delete_ins)
                    if_exists = 'append'
        except NoSuchTableError:
            if_exists = 'replace'

        schema = None
        if table.find('.') >= 0:
            toks = table.split('.', 1)
            schema = toks[0]
            table = toks[1]

        float_columns = list(
            df.select_dtypes(include=['float64', 'float']).keys())
        if len(float_columns) > 0:
            logger.warn(
                table +
                ": Detected columns with float types {}; check whether this is caused by the NaN-integer "
                "column issue of pandas!".format(list(float_columns)))

        typehints = dict()
        obj_columns = list(df.select_dtypes(include=['object']).keys())

        if len(obj_columns) > 0:
            logger.warn(
                table +
                ": Detected columns with object types {}, which are automatically converted to *VARCHAR(256)*; "
                "you can override this by specifying type hints!".format(
                    list(obj_columns)))
        import sqlalchemy.types as sqltypes
        typehints.update(dict((k, sqltypes.VARCHAR(256)) for k in obj_columns))

        # TODO: update typehints with user-specified ones
        _typehints = kwargs.get('typehints', {})
        from parade.type import stdtype_to_sqltype
        for col, stdtype in _typehints.items():
            logger.info(
                table +
                ": Column [{}] is set to type [{}]".format(col, str(stdtype)))
            typehints[col] = stdtype_to_sqltype(stdtype)

        def _chunks(_df, _chunksize):
            """Yield successive chunks of size _chunksize from _df."""
            for i in range(0, len(_df), _chunksize):
                yield _df[i:i + _chunksize]

        # still write to database for empty dataframe
        if df.empty:
            df.to_sql(name=table,
                      con=_conn,
                      index=False,
                      schema=schema,
                      if_exists=if_exists,
                      dtype=typehints)
            logger.warn(table + ": Write to {}: empty dataframe".format(table))
        else:
            for idx, chunk in enumerate(_chunks(df, chunksize)):
                if_exists_ = 'append' if idx > 0 else if_exists
                chunk.to_sql(name=table,
                             con=_conn,
                             index=False,
                             schema=schema,
                             if_exists=if_exists_,
                             dtype=typehints)
                logger.info(table + ": Write to {}: rows #{}-#{}".format(
                    table, idx * chunksize, (idx + 1) * chunksize))

        if if_exists == 'replace':
            if pkey:
                pkeys = pkey if isinstance(pkey, str) else ','.join(pkey)
                _conn.execute('ALTER TABLE {} ADD PRIMARY KEY ({})'.format(
                    table, pkeys))

            for index in indexes:
                index_str = index if isinstance(index, str) else ','.join(index)
                index_name = index if isinstance(index, str) else '_'.join(index)
                _conn.execute('ALTER TABLE {} ADD INDEX idx_{} ({})'.format(
                    table, index_name, index_str))
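
A usage sketch for the incremental path, with the same hypothetical `conn_mgr`; table, column and checkpoint values are illustrative. On the first run the target table does not exist, so `NoSuchTableError` switches the write to 'replace' and the primary key/indexes are created; on later runs rows at or after `last_checkpoint` are re-appended and older rows whose primary keys reappear in the dataframe are rewritten:

    # names and values are illustrative
    conn_mgr.store(df, 'sales.orders', db='warehouse',
                   if_exists='update',
                   pkey='order_id',                                # required in update mode
                   indexes=['country'],                            # only applied on the initial 'replace'
                   checkpoint_column=('created_at', 'updated_at'),
                   last_checkpoint='2024-01-01 00:00:00',
                   chunksize=5000)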