Example #1
def main(argv):
    global db, db_connection, entityType, featureC, targetC, predictC, metric, startTime, endTime, startTimeV, endTimeV, helpString
    get_options(argv)

    # endTime == None means now
    if endTime is None:
        endTimeV = 0
    else:
        endTimeV = ast.literal_eval(endTime)

    startTimeV = ast.literal_eval(startTime) + endTimeV

    # db_schema = None
    db = Database(credentials=credentials)
    print(db)

    # establish a native connection to db2 to store the model
    db_connection = ibm_db.connect(DB2ConnString, '', '')
    print(db_connection)

    model_store = DBModelStore(credentials['tenantId'], entityType,
                               credentials['db2']['username'], db_connection,
                               'db2')
    db.model_store = model_store

    # with open('output.json', 'w+', encoding='utf-8') as G:
    #    json.dump(db.entity_type_metadata, G)

    logger.info('Connected to database - SQL alchemy and native')

    meta = None
    try:
        meta = db.get_entity_type(entityType)
        print('Entity is ', meta)
    except Exception as e:
        logger.error('Failed to retrieve information about entityType ' +
                     str(entityType) + ' from the database because of ' +
                     str(e))

    # make sure the results of the python expression are saved to the derived metrics table
    if metric == '':
        # take the first suitable choice if there is no metric
        sourceTableName = ''
        for di in meta['dataItemDto']:
            sourceTableName = di['sourceTableName']
            if len(sourceTableName) > 0:
                break
        if len(sourceTableName) > 0:
            meta._data_items.append({
                'columnName': predictC,
                'columnType': 'NUMBER',
                'kpiFunctionId': 22856,
                'kpiFunctionDto': {
                    'output': {
                        'name': predictC
                    }
                },
                'name': predictC,
                'parentDataItemName': None,
                'sourceTableName': sourceTableName,
                'tags': {},
                'transient': True,
                'type': 'DERIVED_METRIC'
            })
        else:
            logger.error('No suitable derived metric table found')
            return
    else:
        found = False
        try:
            for di in meta['dataItemDto']:
                if di['name'] == metric:
                    found = True
                    predictC = di['columnName']
                    break
            if not found:
                logger.error('Metric does not exist')
                return
        except Exception as e:
            logger.warning('Could not verify metric ' + str(metric) +
                           ' because of ' + str(e))

    print('Feature', featureC, 'target', targetC)
    gbm = GBMRegressor(features=[featureC],
                       targets=[targetC],
                       predictions=[predictC],
                       max_depth=20,
                       num_leaves=40,
                       n_estimators=4000,
                       learning_rate=0.001)
    # explicitly (re)set the hyperparameters on the estimator instance
    setattr(gbm, 'n_estimators', 4000)
    setattr(gbm, 'max_depth', 20)
    setattr(gbm, 'num_leaves', 40)
    setattr(gbm, 'learning_rate', 0.001)

    gbm.delete_existing_models = True

    logger.info('Created Regressor')

    jobsettings = {
        'db': db,
        '_production_mode': False,
        '_start_ts_override': dt.datetime.utcnow() - dt.timedelta(days=startTimeV),
        '_end_ts_override': dt.datetime.utcnow() - dt.timedelta(days=endTimeV),
        '_db_schema': credentials['db2']['username'],
        'save_trace_to_file': True
    }

    if meta is not None:
        meta._functions = [gbm]
    else:
        logger.error('No valid entity')
        return

    logger.info('Instantiated training job')

    job = pp.JobController(meta, **jobsettings)
    job.execute()

    logger.info('Model trained')

    return
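
Example #1 calls a get_options(argv) helper that is not shown. A minimal sketch of such a helper, assuming it fills the module-level globals declared in main and mirrors the option parsing inlined in Example #2 (and that getopt and sys are imported as there):

def get_options(argv):
    # Hypothetical helper (not in the original): parse command-line options
    # into the module-level globals that main() declares, mirroring the
    # getopt loop shown in Example #2.
    global entityType, featureC, targetC, predictC, startTime, endTime, helpString
    helpString = ('train.py -E <entityType> -f <feature column> -t <target column> '
                  '-p <prediction column> -s <starttime> -e <endtime>')
    try:
        opts, _ = getopt.getopt(argv, "hf:t:p:s:e:E:",
                                ["featureC=", "targetC=", "predictC=",
                                 "startTime=", "endTime=", "entityType="])
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt in ("-E", "--entityType"):
            entityType = arg
        elif opt in ("-f", "--featureC"):
            featureC = arg
        elif opt in ("-t", "--targetC"):
            targetC = arg
        elif opt in ("-p", "--predictC"):
            predictC = arg
        elif opt in ("-s", "--startTime"):
            startTime = arg
        elif opt in ("-e", "--endTime"):
            endTime = arg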
Example #2
def main(argv):

    # entityType = 'Clients04'
    entityType = ''
    featureC = 'pressure'
    targetC = 'temperature'
    predictC = 'predict'
    startTime = None
    endTime = None
    startTimeV = dt.datetime.utcnow()
    endTimeV = dt.datetime.utcnow()
    helpString = 'train.py -E <entityType> -f <feature column> -t <target column> -p <prediction column> \
-s <starttime> -e <endtime>'

    try:
        opts, args = getopt.getopt(
            argv, "hf:t:p:s:e:E:", ["featureC=", "targetC=", "predictC=", "startTime=", "endTime=", "entityType="])
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt in ("-E", "--entityType"):
            entityType = arg
        elif opt in ("-f", "--feature"):
            featureC = arg
        elif opt in ("-t", "--target"):
            targetC = arg
        elif opt in ("-p", "--predict"):
            predictC = arg
        elif opt in ("-s", "--starttime"):
            startTime = arg
        elif opt in ("-e", "--endtime"):
            endTime = arg
    print('EntityType "', entityType)
    print('Feature Column (X) "', featureC)
    print('Target Column (Y) "', targetC)
    print('Predictor Column "', predictC)
    print('StartTime "', startTime)
    print('EndTime "', endTime)

    if entityType == '':
        print('entityType name is missing')
        print(helpString)
        sys.exit(3)

    # endTime == None means now

    if startTime is None:
        print('startTime is missing, please specify relative to endTime (-3 means 3 days before endTime)')
        print(helpString)
        sys.exit(4)
    else:
        startTimeV = dt.datetime.utcnow() - dt.timedelta(days=int(startTime))

    # db_schema = None
    db = Database(credentials=credentials)
    print(db)

    meta = db.get_entity_type(entityType)

    logger.info('Connected to database')

    est = estimator.SimpleRegressor(features=[featureC], targets=[targetC], predictions=[predictC])
    est.delete_existing_models = True
    meta._functions = [est]

    logger.info('Created Regressor')

    # make sure the results of the python expression is saved to the derived metrics table
    meta._data_items.append({'columnName': predictC, 'columnType': 'NUMBER', 'kpiFunctionId': 22856,
                             'kpiFunctionDto': {'output': {'name': predictC}},
                             'name': predictC, 'parentDataItemName': None, 'sourceTableName': 'DM_CLIENTS04',
                             'tags': {}, 'transient': True, 'type': 'DERIVED_METRIC'})

    jobsettings = {'_production_mode': False,
                   '_start_ts_override': dt.datetime.utcnow() - dt.timedelta(days=10),
                   '_end_ts_override': (dt.datetime.utcnow() - dt.timedelta(days=1)),  # .strftime('%Y-%m-%d %H:%M:%S'),
                   '_db_schema': 'BLUADMIN',
                   'save_trace_to_file': True}

    logger.info('Instantiated training job')

    job = pp.JobController(meta, **jobsettings)
    job.execute()

    logger.info('Model trained')

    return
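
Neither example defines an entry point; a standard guard (an assumption, not part of the originals) would pass the command-line arguments through to main():

# Assumed entry point: forward the argument vector, minus the script name,
# to main(). Relies on the sys import the examples already use.
if __name__ == '__main__':
    main(sys.argv[1:])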
def load_metrics_data_from_csv(entity_type_name,
                               file_path,
                               credentials=None,
                               **kwargs):
    """
    reads metrics data from csv and stores in entity type metrics table
    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in csv
    'evt_timestamp' column will be inferred to be current time if None present

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default:append
    }
    :return:
    """
    # load csv in dataframe
    df = pd.read_csv(file_path)

    # Map the lowering function to all column names
    # required columns are lower case
    df.columns = map(str.lower, df.columns)

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials)
    # check if entity type table exists
    db_schema = None
    if 'db_schema' in kwargs:
        db_schema = kwargs['db_schema']
    # get the entity type to add data to
    try:
        entity_type = db.get_entity_type(entity_type_name)
    except Exception:
        raise Exception(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create the entity type before loading data from csv. '
            f'Refer to create_custom_entitytype() to create the entity type first.'
        )

    # find required columns
    required_cols = db.get_column_names(table=entity_type.name,
                                        schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')
    # Add data for missing columns that are required
    # required columns that can't be NULL: {'evt_timestamp', 'updated_utc', 'devicetype'}
    for m in missing_cols:
        if m == entity_type._timestamp:
            # get possible timestamp columns and select the first candidate
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                df_timestamp_columns = df_timestamp.columns
                timestamp_col = df_timestamp_columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(
                    f'Inferred column {timestamp_col} as missing column {m}')
            else:
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(
                    f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = entity_type.logical_name
            logger.debug(
                f'Adding data: {entity_type.logical_name} to missing column {m}'
            )
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == entity_type._entity_id:
            raise Exception(f'Missing required column {m}')
        else:
            df[m] = None

    # remove columns that are not required
    df = df[required_cols]
    # write the dataframe to the database table
    db.write_frame(df=df, table_name=entity_type.name)
    logger.debug(
        f'Inserted {len(df.index)} rows of data into {entity_type.name}'
    )

    # CLOSE DB CONNECTION
    db.release_resource()

    return
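
A possible invocation of the loader above, shown as a sketch: the entity type name 'Clients04' and schema 'BLUADMIN' are borrowed from Example #2, while the csv path and credentials file are placeholders.

# Hypothetical usage of load_metrics_data_from_csv(). The entity type and
# schema come from Example #2; 'credentials.json' and the csv path are
# placeholders for your own files.
import json

with open('credentials.json', encoding='utf-8') as f:
    credentials = json.load(f)

load_metrics_data_from_csv('Clients04',
                           './clients04_metrics.csv',
                           credentials=credentials,
                           db_schema='BLUADMIN')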