Example #1
# Required imports (the standard data-science stack; "Logger" is assumed to be
# a project-local utility importable in this package)
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

class Data:

    packageName = "com.brodagroup.machinelearning.common.Data"

    logger = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        self.parameters = []
        return

    def configure(self):
        return 0

    def load(self, pathCSV):
        # dataframe = pd.read_csv(pathCSV, quotechar='"', skipinitialspace=True)
        dataframe = pd.read_csv(pathCSV)
        return dataframe

    def loadDataFrame(self, pathPKL):
        dataframe = pd.read_pickle(pathPKL)
        return dataframe

    def saveDataFrame(self, dataframe, pathPKL):
        dataframe.to_pickle(pathPKL)
        return

    def segment(self, features, target, totalPct, testingPct, randomState):

        # Use only a small subset of the rows to reduce run-time while
        # testing hypotheses
        numRows, numFeatures = features.shape
        rowsUsed = int(numRows * totalPct)

        xfeatures = features[0:rowsUsed]
        xtarget = target[0:rowsUsed]

        # Note:  features represents "X" and target represents "y"
        X_train, X_test, y_train, y_test = train_test_split(
            xfeatures, xtarget, test_size=testingPct, random_state=randomState
        )

        return (X_train, X_test, y_train, y_test)

    def shuffle(self, dataset):
        # np.random.shuffle shuffles in place along the first axis and
        # returns None, so this method has no return value
        np.random.shuffle(dataset)
        return

    def sample(self, dataset, count):
        sample = np.random.choice(dataset.index.values, count)
        return sample

    def normalize(self, df, scaleType="std"):

        self.logger.info("Normalizing data, type: {0}".format(scaleType))

        cols = df.columns.values

        inArray = df[cols].values
        outArray = None

        if scaleType == "minmax":
            # minmax_scale = preprocessing.MinMaxScaler().fit(df[cols])
            minmax_scale = preprocessing.MinMaxScaler().fit(inArray)
            outArray = minmax_scale.transform(inArray)
        else:
            # std_scale = preprocessing.StandardScaler().fit(df[cols])
            std_scale = preprocessing.StandardScaler().fit(inArray)
            outArray = std_scale.transform(inArray)

        # Preserve the original index so rows stay aligned with any target
        df = pd.DataFrame(data=outArray, columns=cols, index=df.index)

        return df

    def threshold(self, X, lower, lvalue, upper, uvalue):
        X[X <= lower] = lvalue
        X[X >= upper] = uvalue
        return X

    def join(self, leftDF, rightDF, onKeys):
        result = pd.merge(leftDF, rightDF, on=onKeys)
        return result

    def categorize(self, df, field):
        self.logger.debug("Categorizing field: {0}, type: {1}".format(field, df[field].dtype))

        # Create and fill new columns for the categorized field
        if df[field].dtype == "object":

            values = list(pd.Series(df[field].values.ravel()).unique())

            self.logger.debug("Field: {0}, has value count: {1}".format(field, len(values)))

            if len(values) > 2:

                for value in values:
                    # Create the new field name based upon original name and values
                    # Note -- take into account missing values
                    if pd.isnull(value):
                        xfield = field + "-" + "Missing"
                    else:
                        # Strip commas
                        xvalue = value.replace(",", "")
                        xfield = field + "-" + xvalue

                    # Create and fill in the new columns with values
                    df.loc[:, xfield] = -1
                    self.logger.debug("Creating field: {0}, type: {1}".format(xfield, df[xfield].dtype))
                    # df[xfield] = df[field].apply(lambda x: 1 if x == value else 0)
                    df.loc[:, xfield] = df.loc[:, field].apply(lambda x: 1 if x == value else 0)

                # Remove the original field
                self.logger.debug("Dropping field: {0}, type: {1}".format(field, df[field].dtype))
                df = df.drop(field, axis=1)

        return df
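
    # Example of categorize on hypothetical data: an object column 'Color'
    # with values {'Red', 'Blue', NaN} expands into indicator columns
    # 'Color-Red', 'Color-Blue' and 'Color-Missing', after which the
    # original 'Color' column is dropped.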

    def sync(self, dfA, dfB):

        self.logger.debug("Synchronizing...")

        listA = list(dfA.columns.values)
        self.logger.debug("DataFrame A, columns: {0}".format(listA))

        listB = list(dfB.columns.values)
        self.logger.debug("DataFrame B, columns: {0}".format(listB))

        setA = set(listA)
        setB = set(listB)

        columnsNotInB = setA.difference(setB)
        self.logger.debug("Columns in A but not in B: {0}".format(columnsNotInB))

        for column in columnsNotInB:
            dfB[column] = 0

        columnsNotInA = setB.difference(setA)
        self.logger.debug("Columns in B but not in A: {0}".format(columnsNotInA))

        for column in columnsNotInA:
            dfA[column] = 0

        return (dfA, dfB)

    def prune(self, df, keep=None, remove=None):
        if keep:
            self.logger.info("Pruning, keeping fields: {0}".format(keep))
            df = df[keep]

        if remove:
            self.logger.info("Pruning, removing fields: {0}".format(remove))
            df = df.drop(remove, axis=1)

        return df

    def encodeList(self, columns, dfA, dfB):

        if columns:
            for column in columns:
                dfA, dfB = self.encode(column, dfA, dfB)

        return (dfA, dfB)

    def encode(self, column, dfA, dfB):

        # Note: all input dataframes must be encoded together. If each
        # dataframe were encoded independently, the labels would be derived
        # from the values present in that dataframe alone, and the resulting
        # mappings would diverge whenever the two dataframes contain
        # different value sets for the same column. Fitting a single encoder
        # on the combined values guarantees one consistent mapping.
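        #
        # Concrete illustration (hypothetical values; LabelEncoder assigns
        # codes in sorted order of the classes):
        #   fit on dfA alone, values {'N', 'Y'}:       N=0, Y=1
        #   fit on dfB alone, values {'A', 'N', 'Y'}:  A=0, N=1, Y=2
        # Here 'Y' would map to 1 in dfA but 2 in dfB; fitting one encoder
        # on the combined values gives both dataframes the same mapping.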

        lbl = preprocessing.LabelEncoder()

        self.logger.debug("Encoding field: {0}".format(column))

        valuesA = list(dfA[column].values)
        valuesB = list(dfB[column].values)
        values = valuesA + valuesB
        lbl.fit(values)

        # self.logger.debug('Encoding field: {0}, classes: {1}'.format(column, lbl.classes_))
        # xto = lbl.transform(values)
        # xfrom = lbl.inverse_transform(xto)
        # self.logger.debug('Encoding field {0}, FROM: {1}'.format(column, xfrom))
        # self.logger.debug('Encoding field {0}, TO:   {1}'.format(column, xto))

        dfA[column] = lbl.transform(valuesA)
        dfB[column] = lbl.transform(valuesB)

        return (dfA, dfB)
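
For orientation, here is a minimal usage sketch of the Data class. The file
names and the 'label' column are hypothetical; it assumes the imports above
and a working project-local Logger.

if __name__ == '__main__':
    data = Data()

    # Load a CSV and cache it as a pickle for faster subsequent loads
    df = data.load('train.csv')
    data.saveDataFrame(df, 'train.pkl')

    # Scale the numeric columns to zero mean and unit variance
    numeric = data.normalize(df.select_dtypes(include='number'))

    # Keep the first 80% of rows, then hold out 25% of those for testing
    X_train, X_test, y_train, y_test = data.segment(
        numeric, df['label'], totalPct=0.8, testingPct=0.25, randomState=42)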
Example #2
# Required imports ("Logger", "Data", "Scorer" and "ClassifierList" are
# assumed to be project-local modules importable in this package)
import json
import os
import datetime as dt
import urllib.request
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

class Runner:

    packageName = 'com.brodagroup.machinelearning.common.Runner'

    logger = None
    rpt = None
    gridsearchrpt = None
    featurerpt = None
    scoringrpt = None
    preprocessor = None

    # features: dataframe used for fit / learning
    features = None

    # test: dataframe used for prediction
    test = None

    # target:  dataframe (single column) of actual/correct values (for scoring)
    target = None

    # expected: dataframe (single column) of actual values (for verification that algo works)
    expected = None
    hasExpected = False

    # y_pred: array of predictions (integer)
    y_pred = None

    # yy_pred: array of prediction probabilities (float)
    yy_pred = None

    # Initializer
    def __init__(self):
        self.logger = Logger(self.packageName).getLogger()
        return

    def dumpConfiguration(self):
        pretty = json.dumps(self.configuration, sort_keys=True, indent=4)
        return(pretty)

    def configure(self, jsonstr=None, file=None, url=None, overrides=None):
        if file:
            self.logger.info('Using configuration file: {0}'.format(file))
            with open(file, encoding='utf-8') as configurationFile:
                configuration = json.loads(configurationFile.read())
        elif url:
            configuration = json.loads(urllib.request.urlopen(url).read())
        elif jsonstr:
            configuration = json.loads(jsonstr)
        else:
            raise RuntimeError('Configuration not provided (json|file|url)')

        self.configuration = configuration
        self.logger.info('Using configuration: {0}'.format(self.dumpConfiguration()))

        self.override(overrides=overrides)

        classifierCode = self.configuration['classifier']
        parameters = self.configuration['parameters']

        classifierList = ClassifierList()
        classifier = classifierList.load(classifierCode, parameters)
        self.classifier = classifier

        return

    def modifyConfiguration(self, dictionary, name, value, depth):
        depth = depth + 1

        parts = name.split('.')
        name = parts[0]

        if isinstance(dictionary[name], dict):
            if depth > 3:
                raise RuntimeError('Error -- too many levels in configuration')

            # Recurse into the nested dictionary itself (not self.configuration,
            # which would always restart from the top level) and keep any
            # remaining dotted segments intact
            xdict = dictionary[name]
            xname = '.'.join(parts[1:])
            self.modifyConfiguration(xdict, xname, value, depth)
        else:
            dictionary[name] = value
            self.logger.info('Setting name: {0} to value: {1}'.format(name, value))

        return(name)
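
    # Example (hypothetical): modifyConfiguration(cfg, 'parameters.max_depth', 7, 0)
    # descends into cfg['parameters'] and sets cfg['parameters']['max_depth'] = 7;
    # this is how the 'name:value' overrides handled below reach nested keys.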

    def override(self, overrides=None):

        if overrides:
            self.logger.info('Overriding parameters: {0}'.format(overrides))
            for nvp in overrides:
                x = nvp.split(':')

                name = x[0]
                value = x[1]
                self.modifyConfiguration(self.configuration, name, value, 0)

            self.logger.info('Using new configuration: {0}'.format(self.dumpConfiguration()))

        return

    def setPreprocessor(self, c):
        # Renamed from 'preprocessor' so the setter no longer collides with
        # the 'preprocessor' attribute it assigns to
        self.logger.info('Setting preprocessor')
        self.preprocessor = c
        return

    def load(self):
        self.logger.info('Loading data')

        data = Data()

        trainCSV = self.configuration['trainCSV']
        testCSV = self.configuration['testCSV']

        featuresPKL = self.configuration['featuresPKL']
        targetPKL = self.configuration['targetPKL']
        testPKL = self.configuration['testPKL']

        expectedCSV = None
        expectedPKL = None
        try:
            expectedCSV = self.configuration['expectedCSV']
            expectedPKL = self.configuration['expectedPKL']
        except KeyError:
            # The expected-values files are optional
            pass

        # If the dataframe (pickled) file exists, then load it
        # Otherwise, load the CSV, preprocess it, and then save it as a
        # PKL file which will reduce load times
        tmpFeatures = None
        tmpTarget = None
        tmpTest = None

        if os.path.exists(featuresPKL):
            self.logger.info('Loading train PKL: {0}'.format(featuresPKL))
            tmpFeatures = data.loadDataFrame(featuresPKL)
            self.logger.info('Loading target PKL: {0}'.format(targetPKL))
            tmpTarget = data.loadDataFrame(targetPKL)
            self.logger.info('Loading test PKL: {0}'.format(testPKL))
            tmpTest = data.loadDataFrame(testPKL)

        else:
            self.logger.info('Loading train CSV: {0}'.format(trainCSV))
            rawtrain = data.load(trainCSV)
            self.logger.info('Loading test CSV: {0}'.format(testCSV))
            rawtest = data.load(testCSV)

            # Preprocess the data
            tmpFeatures, tmpTarget, tmpTest = self.preprocessor.execute(rawtrain, rawtest)

            # Save the dataframe (lower load times)
            self.logger.info('Saving features PKL: {0}'.format(featuresPKL))
            data.saveDataFrame(tmpFeatures, featuresPKL)
            self.logger.info('Saving target PKL: {0}'.format(targetPKL))
            data.saveDataFrame(tmpTarget, targetPKL)
            self.logger.info('Saving test PKL: {0}'.format(testPKL))
            data.saveDataFrame(tmpTest, testPKL)

        if expectedPKL and os.path.exists(expectedPKL):
            self.logger.info('Loading expected PKL: {0}'.format(expectedPKL))
            tmpExpected = data.loadDataFrame(expectedPKL)
            self.hasExpected = True

        elif expectedCSV and os.path.exists(expectedCSV):
            self.logger.info('Loading expected CSV: {0}'.format(expectedCSV))
            tmpExpected = data.load(expectedCSV)
            self.logger.info('Saving expected PKL: {0}'.format(expectedPKL))
            data.saveDataFrame(tmpExpected, expectedPKL)
            self.hasExpected = True

        self.features = tmpFeatures
        self.target = tmpTarget
        self.test = tmpTest

        if self.hasExpected:
            self.expected = tmpExpected

        return

    # Segment the TRAINING set into a smaller
    # cross validation set of data
    def segment(self):

        self.logger.info('Segmenting...')

        data = Data()

        totalpct = float(self.configuration['totalpct'])
        testpct = float(self.configuration['testpct'])
        randomstate = int(self.configuration['randomstate'])
        X_train, X_test, y_train, y_test = data.segment(self.features, self.target, totalpct, testpct, randomstate)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        return

    def fit(self):

        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)

        self.classifier.fit(npXTrain, npyTrain)
        return

    def crossvalidate(self):
        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)

        rptDF = self.classifier.crossvalidate(npXTrain, npyTrain)
        self.crossvalidationDF = rptDF
        pd.set_option('display.max_rows', 10000)
        self.logger.info('Cross Validation Report\n{0}'.format(rptDF))
        return

    def gridsearch(self, use=True, score='roc_auc'):

        self.logger.info('Executing grid search...')

        parameters = self.configuration['gridsearch']

        x = GridSearchCV(self.classifier, parameters, cv=6, scoring=score, verbose=10, n_jobs=6)
        #x = GridSearchCV(self.classifier, parameters, cv=5, scoring=score, verbose=10)

        npXTrain = np.array(self.X_train).astype(np.float32)
        npyTrain = np.array(self.y_train).astype(np.int32)
        x.fit(npXTrain, npyTrain)

        rpt = 'Grid Search Analysis \t\t' + str(dt.datetime.now())
        rpt = rpt + '\n\nParameters {0}'.format(parameters)
        rpt = rpt + '\n\nBest parameters set found:'
        rpt = rpt + '\n\t' + '{0}'.format(x.best_estimator_)
        rpt = rpt + '\n\nGrid Search Scores (using {0}):'.format(score)
        rpt = rpt + '\nSCORE\t\tSTDDEV(+/-)\tPARAMETERS:'
        # Note: grid_scores_ was removed in scikit-learn 0.20; cv_results_
        # carries the equivalent information
        results = x.cv_results_
        for params, mean_score, std_score in zip(
                results['params'], results['mean_test_score'], results['std_test_score']):
            rpt = rpt + '\n' + '{0:0.7f}\t{1:0.7f}'.format(mean_score, std_score / 2)
            for key in params:
                value = params[key]
                rpt = rpt + '\t\t{0}\t\t{1}'.format(key, value)

        if use:
            self.classifier = x.best_estimator_

        self.gridsearchrpt = rpt

        return(rpt)
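
    # Example 'gridsearch' configuration entry (hypothetical values): a dict
    # mapping parameter names to lists of candidate values, e.g.
    #   "gridsearch": {"n_estimators": [100, 200], "max_depth": [3, 5, 7]}
    # GridSearchCV then evaluates every combination with 6-fold CV.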

    def importance(self):

        self.logger.info('Creating feature importance report...')

        rpt = None

        rpt = 'Feature Importance \t\t' + str(dt.datetime.now())
        if self.classifier is None:
            return(rpt)

        if hasattr(self.classifier, 'importance'):
            df = self.classifier.importance(self.X_train.columns.values)
            rpt = rpt + '\n\n{0}'.format(df)

        if hasattr(self.classifier, 'feature_importances_'):

            fi = pd.DataFrame(self.classifier.feature_importances_)
            columns = pd.DataFrame(self.X_train.columns.values)
            result = pd.concat([columns, fi], axis=1)
            result.columns = ['Feature', 'Importance']
            ranked = result.sort_values(['Importance', 'Feature'], ascending=[False, True])
            rpt = rpt + '\n{0}'.format(ranked)

        #pd.set_option('display.max_rows', len(ranked))
        #pd.reset_option('display.max_rows')
        self.featurerpt = rpt

        return(rpt)

    def score(self):

        self.logger.info('Scoring...')

        npXTest = np.array(self.X_test).astype(np.float32)

        y_pred = self.classifier.predict(npXTest)
        yy_pred = self.classifier.predict_proba(npXTest)[:,1]

        self.logger.debug('Features shape: {0}, test shape: {1}'.format(
            self.features.shape, self.test.shape))

        reportName = 'Cross Verification Data Report \t\t' + str(dt.datetime.now())

        scorer = Scorer()
        y_test = self.y_test
        rpt = scorer.score(
                y_test,
                y_pred,
                yy_pred,
                classifier=self.classifier,
                title=reportName,
                configuration=self.configuration )

        self.y_pred = y_pred
        self.yy_pred = yy_pred
        self.scoringrpt = rpt

        return(rpt)

    def inspect(self, name):
        x = getattr(self, name)
        return(x)

    def inquire(self, name):
        x = hasattr(self, name)
        return(x)

    def inject(self, name, value):
        setattr(self, name, value)
        return

    def report(self):
        self.logger.info('Executing full report')

        rpt = '\nFull Report\n'

        if self.featurerpt:
            rpt = rpt + '\n\n{0}'.format(self.featurerpt)
        if self.scoringrpt:
            rpt = rpt + '\n\n{0}'.format(self.scoringrpt)
        if self.gridsearchrpt:
            rpt = rpt + '\n\n{0}'.format(self.gridsearchrpt)

        self.rpt = rpt

        return(rpt)

    def predict(self):
        self.logger.info('Predicting...')

        submissionSample = self.configuration['submissionSample']
        submissionDir = self.configuration['submissionDir']

        timestamp = dt.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

        submissionVFile = submissionDir + '/' + 'submission-values-' + timestamp + '.csv'
        submissionPFile = submissionDir + '/' + 'submission-probabilities-' + timestamp + '.csv'
        submissionLog = submissionDir + '/' + 'submission-' + timestamp + '.txt'

        npTest = np.array(self.test).astype(np.float32)

        y_pred = self.classifier.predict(npTest)
        yy_pred = self.classifier.predict_proba(npTest)[:,1]

        predictionrpt = None
        if self.hasExpected:

            self.logger.debug('Target is available... Scoring target')
            # The second column contains the actual values
            y_test = self.expected.iloc[:,1]

            scorer = Scorer()
            reportName = '\nTarget Data Prediction Report \t\t' + timestamp
            predictionrpt = scorer.score(
                    y_test,
                    y_pred,
                    yy_pred,
                    classifier=self.classifier,
                    title=reportName,
                    configuration=self.configuration )
            print(predictionrpt)

        sample = pd.read_csv(submissionSample)
        sample.QuoteConversion_Flag = y_pred
        sample.to_csv(submissionVFile, index=False)

        probabilities = pd.read_csv(submissionSample)
        probabilities.QuoteConversion_Flag = yy_pred
        probabilities.to_csv(submissionPFile, index=False)

        mfeatures, nfeatures = self.features.shape
        mtest, ntest = self.test.shape
        mxtrain, nxtrain = self.X_train.shape
        mxtest, nxtest = self.X_test.shape

        self.logger.debug('Saving submission information')
        with open(submissionLog, 'a') as f:
            f.write('Submission Report \t\t\t Generated at: {0}'.format(timestamp))
            f.write('\n\nData Statistics:')
            f.write('\n\tFeature data: \trows: {0}, columns: {1}'.format(mfeatures, nfeatures))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mtest, ntest))
            f.write('\n\nCross Validation Statistics:')
            f.write('\n\tTraining data: \trows: {0}, columns: {1}'.format(mxtrain, nxtrain))
            f.write('\n\tTest data: \t\trows: {0}, columns: {1}'.format(mxtest, nxtest))
            f.write('\n\nValues file:\t\t{0}'.format(submissionVFile))
            f.write('\nProbabilities file:\t{0}'.format(submissionPFile))
            f.write('\n')
            f.write('{0}'.format(self.report()))

            if predictionrpt:
                f.write('\n\n{0}'.format(predictionrpt))

        return(submissionLog, submissionVFile, submissionPFile, self.classifier)
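
Finally, a minimal sketch of the Runner life cycle. The configuration file
name and MyPreprocessor are hypothetical; a preprocessor only needs an
execute(rawtrain, rawtest) method returning the features, target and test
dataframes:

if __name__ == '__main__':
    runner = Runner()

    # Load configuration; overrides use 'name:value', with dots for nesting
    runner.configure(file='config.json',
                     overrides=['parameters.n_estimators:100'])

    runner.setPreprocessor(MyPreprocessor())  # MyPreprocessor is hypothetical

    runner.load()      # CSV -> preprocess -> cached PKL dataframes
    runner.segment()   # split features/target for cross-validation
    runner.fit()
    print(runner.score())
    submissionLog, valuesFile, probabilitiesFile, classifier = runner.predict()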