예제 #1
0
def loadModelFromDatabase(columnName, station_id):
    cluster = Cluster(['192.168.246.236'])
    session = cluster.connect("dev")
    name = str(station_id + "__" + columnName)
    query = "SELECT model FROM linear_model WHERE name=%s"
    rows = session.execute(query, parameters=[(name)])
    # rows = session.execute('SELECT model FROM linear_model WHERE name=\"'+name+'\"')
    if (rows):
        for row in rows:
            loadedCustomModel = pickle.loads(row[0])
            loadedModel = loadedCustomModel.getModel()

            lrModel = TrainValidationSplitModel(loadedModel)
            return row[0]
예제 #2
0
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels
        condition = (df[randCol] >= tRatio)
        train_fold = self.train_fold
        test_fold = self.test_fold

        df = df.sort(df.id.asc())
        dfp = df.toPandas()
        dfp = np.array_split(dfp, train_fold + test_fold)
        train = self.spark.createDataFrame(data=dfp[0].round(3))
        for i in range(1, train_fold):
            p = self.spark.createDataFrame(data=dfp[i].round(3))
            train = train.union(p)
        validation = self.spark.createDataFrame(data=dfp[-1].round(3))
        for j in range(-2, -test_fold - 1, -1):
            q = self.spark.createDataFrame(data=dfp[j].round(3))
            validation = validation.union(q)
        validation = validation.sort(validation.id.asc())
        train = train.sort(train.id.asc())

        # train.select(train.id).show(14000)
        # print('#######################################################################')
        # validation.select(validation.id).show(14000)

        models = est.fit(train, epm)
        for j in range(numModels):
            model = models[j]
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric
        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel, metrics))