    def generate_feature_from_data(self, inp_data):
        try:
            LOGGER.info("What features do we need")

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
    def get_features_for_prediction(self, data):

        try:
            LOGGER.info("I can predict")
        except Exception:
            LOGGER.error(traceback.format_exc())
            raise
Example #3
    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(
                train_data,
                numClasses=numOfClasses,
                regParam=0,
                regType=regularization,
                intercept=True,
                iterations=num_of_iterations,
                validateData=False)

            self.model = logistic_model

        except Exception as e:
            raise e
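These snippets target the Spark 1.x, RDD-based MLlib API. A minimal setup sketch of what they assume follows; only the pyspark imports are the real API, while the wrapper class body and attribute names (self.sc, self.sql_context, self.model) are inferred from the calling code. Note also that DataFrame.map was removed in Spark 2.0, so on newer versions the spark_df.select(...).map(...) calls would need .rdd.map(...).

import logging
import traceback

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

LOGGER = logging.getLogger(__name__)


class LogisticRegression(object):
    # Inferred wrapper class; the train/test_train/predict/persist methods
    # shown in these examples are assumed to live on it.
    def __init__(self):
        self.sc = SparkContext.getOrCreate()
        self.sql_context = SQLContext(self.sc)
        self.model = None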
def complete_task(fileobj):
    try:

        with fileobj.open('w') as f:
            f.write('Done')
    except Exception as e:
        LOGGER.error(traceback.format_exc())
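complete_task writes a sentinel to the task's output target, which matches a Luigi-style pipeline (the run()/self.output() pattern further down suggests this but does not confirm it). A hedged usage sketch with a hypothetical task:

import luigi


class LoadTripDataTask(luigi.Task):
    # hypothetical task name; output() returns the target that complete_task marks
    def output(self):
        return luigi.LocalTarget("load_trip_data.done")

    def run(self):
        # ... load the trip data, then mark the task as done
        complete_task(self.output())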
    def get_data(self):
        try:
            LOGGER.info("Howdy data cruncher")

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
    def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))

            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp: (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            LOGGER.info(prediction_and_label.map(lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
        except Exception as e:
            raise e
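The final log line above is plain accuracy: the fraction of (prediction, label) pairs that match. For richer metrics the same prediction_and_label RDD can be handed to MLlib's evaluator; a sketch, assuming Spark 2.x where MulticlassMetrics.accuracy is available:

from pyspark.mllib.evaluation import MulticlassMetrics

# cast both sides to float so the evaluator's schema is satisfied
metrics = MulticlassMetrics(
    prediction_and_label.map(lambda pl: (float(pl[0]), float(pl[1]))))
LOGGER.info("accuracy: %s" % metrics.accuracy)
LOGGER.info("confusion matrix:\n%s" % str(metrics.confusionMatrix().toArray()))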
Example #8
    def train_model(self, df):
        try:
            # use this data to train the model and predict
            model = LogisticRegression()
            model.train(df=df, target='target')
            return model
        except Exception as e:
            LOGGER.error(traceback.format_exc())
    def train_test_model(self, df):
        try:
            # use this data to train the model and predict
            model = LogisticRegression()
            LOGGER.info("training the model")
            model.test_train(df=df, target='target', train_split=0.8, test_split=0.2)

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
Example #11
    def get_data(self):
        try:
            conn = DBConn().get_connection()
            query = '''SELECT * from trips '''
            LOGGER.info("Reading data from db : %s" % (query))
            df = pd.read_sql(query, con=conn)
            return df

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
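DBConn is a project-specific helper that is not shown in these examples; the only contract get_data relies on is that get_connection() returns something pandas.read_sql accepts. A purely illustrative stand-in (the class body and the SQLite path are hypothetical):

import sqlite3


class DBConn(object):
    # Illustrative only -- any DB-API connection or SQLAlchemy engine
    # works as the con argument of pd.read_sql.
    def __init__(self, db_path="bikeshare.db"):
        self.db_path = db_path

    def get_connection(self):
        return sqlite3.connect(self.db_path)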
Example #13
    def predict(self, df):
        try:
            LOGGER.info("Predicting using logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            inp_data = spark_df.select(*feature_columns).map(lambda x: list(x))
            result = self.model.predict(inp_data).collect()
            LOGGER.info("Predicted output is %s" % str(result))
            return result

        except Exception as e:
            raise e
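A usage sketch for predict: the input is a pandas frame whose columns are exactly the feature columns used in training (for the trips example, hour, day and terminal_code; the values below are placeholders).

import pandas as pd

features = pd.DataFrame({
    "hour": [8, 17],
    "day": [1, 5],
    "terminal_code": [70, 55],   # placeholder terminal codes
})
result = model.predict(features)   # model: a trained LogisticRegression wrapper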
Example #14
    def train_test_model(self, df):
        try:
            # use this data to train the model and predict
            model = LogisticRegression()
            LOGGER.info("training the model")
            model.test_train(df=df,
                             target='target',
                             train_split=0.8,
                             test_split=0.2)

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
    def predict(self, df):
        try:
            LOGGER.info("Predicting using logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            inp_data = spark_df.select(*feature_columns).map(lambda x: list(x))
            result = self.model.predict(inp_data).collect()
            LOGGER.info("Predicted output is %s" % str(result))
            return result

        except Exception as e:
            raise e
Example #16
    def generate_target(self, df):

        try:
            LOGGER.info("generating target column")
            # generate target variables
            df.trip_count.mean()
            df["target"] = 0
            df.ix[(df.trip_count > df.trip_count.mean()), "target"] = 1
            df.target.value_counts()

            df = df.ix[:, ["hour", "day", "terminal_code", "target"]]
            return df

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
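DataFrame.ix dates this code to older pandas: it was deprecated in 0.20 and removed in 1.0. On current pandas the same target generation is written with .loc, roughly:

# .loc equivalent of the .ix calls above
df["target"] = 0
df.loc[df.trip_count > df.trip_count.mean(), "target"] = 1
df = df.loc[:, ["hour", "day", "terminal_code", "target"]]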
Example #18
    def test_train(self,
                   df,
                   target,
                   train_split,
                   test_split,
                   regularization=None,
                   num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split],
                                               seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))

            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(
                train_data,
                numClasses=numOfClasses,
                regParam=0,
                regType=regularization,
                intercept=True,
                iterations=num_of_iterations,
                validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp:
                                    (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            LOGGER.info(
                prediction_and_label.map(lambda labelAndPred: labelAndPred[0]
                                         == labelAndPred[1]).mean())
        except Exception as e:
            raise e
Example #19
def get_categorical_codes(data_df, categorical_columns):
    try:
        for column in categorical_columns:
            print(column)
            data_df[column] = data_df[column].str.strip()
            data_df[column] = data_df[column].str.lower()
            data_df[column] = data_df[column].astype('category')

        modified_string = '_code'
        categorical_columns_modified = [column + modified_string for column in categorical_columns]

        data_df[categorical_columns_modified] = data_df[categorical_columns].apply(lambda x: x.cat.codes)

        return data_df

    except Exception as e:
        LOGGER.error(traceback.format_exc())
        raise e
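A small usage sketch for get_categorical_codes with toy data (the column names are placeholders):

import pandas as pd

df = pd.DataFrame({"city": [" SF", "Oakland ", "sf"], "trips": [10, 3, 7]})
df = get_categorical_codes(df, categorical_columns=["city"])
# after stripping and lower-casing, " SF" and "sf" get the same city_code
print(df[["city", "city_code"]])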
Example #20
    def get_features_for_prediction(self, data):

        try:
            json_data = json.dumps(data)
            df = pd.read_json(json_data)
            LOGGER.info("feature set is")
            LOGGER.info(df.head())

            return df
        except Exception:
            LOGGER.error(traceback.format_exc())
            raise
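get_features_for_prediction just round-trips the request payload through JSON into a DataFrame. A usage sketch with a hypothetical payload (a list of records is the simplest shape pd.read_json accepts after json.dumps):

payload = [
    {"hour": 8, "day": 1, "terminal_code": 70},
    {"hour": 17, "day": 5, "terminal_code": 55},
]
features_df = model.get_features_for_prediction(payload)   # model: whichever object owns this method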
    def run(self):

        try:
            LOGGER.info("starting load trip task")
            bike_share = BayBikeShare()
            bike_share.load_data()

            LOGGER.info("Load trip data complete")
            complete_task(self.output())

        except Exception as e:
            LOGGER.error(traceback.format_exc())
    def persist(self, location):
        try:
            LOGGER.info("Writing the model to location %s"%location)
            data = 'data'
            meta_data = 'metadata'

            data_location = os.path.join(location, data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s"%data_location)
                shutil.rmtree(data_location)

            data_location = os.path.join(location, meta_data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s"%data_location)
                shutil.rmtree(data_location)

            self.model.save(self.sc, location)
        except Exception as e:
            raise e
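persist clears any existing data/metadata subdirectories (the layout MLlib's model.save writes) before saving. Loading the model back later is the mirror call; a sketch using the same sc and location as in persist:

from pyspark.mllib.classification import LogisticRegressionModel

restored_model = LogisticRegressionModel.load(sc, location)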
Example #24
    def persist(self, location):
        try:
            LOGGER.info("Writing the model to location %s" % location)
            data = 'data'
            meta_data = 'metadata'

            data_location = os.path.join(location, data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s" % data_location)
                shutil.rmtree(data_location)

            data_location = os.path.join(location, meta_data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s" % data_location)
                shutil.rmtree(data_location)

            self.model.save(self.sc, location)
        except Exception as e:
            raise e
    def run(self):

        try:
            LOGGER.info("starting terminal traffic train task")
            terminal_traffic = TerminalTraffic()
            LOGGER.info("get traffic terminal data")
            data = terminal_traffic.get_data()

            LOGGER.info("generate features for terminal traffic")
            df = terminal_traffic.generate_feature_from_data(inp_data=data)
            LOGGER.info("generate target for terminal traffic")
            df = terminal_traffic.generate_target(df)

            LOGGER.info("train traffic model")
            model = terminal_traffic.train_model(df)
            self.set_path()
            cwd = os.getcwd()
            main_dir = os.path.join(cwd, self.main_directory, self.models_directory, self.name)
            LOGGER.info("persisting predictive model")
            model.persist(location=main_dir)

            complete_task(self.output())

        except Exception:
            LOGGER.error(traceback.format_exc())
Example #26
    def generate_feature_from_data(self, inp_data):
        try:
            LOGGER.info("Generating features from data")
            LOGGER.info("Input data has the shape %s" % str(inp_data.shape))

            inp_data['start_hour'] = inp_data["Start Date"].apply(
                mlUtils.get_hour)
            inp_data['start_day'] = inp_data["Start Date"].apply(
                mlUtils.get_day)

            inp_data['end_hour'] = inp_data["End Date"].apply(mlUtils.get_hour)
            inp_data['end_day'] = inp_data["End Date"].apply(mlUtils.get_day)

            LOGGER.info(inp_data.head())

            # now let's find the traffic count for an hour, given a day of the week and a terminal
            start_df = inp_data.groupby(
                by=["start_hour", "start_day", "Start Terminal"]).count().copy()
            start_df = start_df.reset_index()

            LOGGER.info(start_df.head())

            LOGGER.info("creating start df")
            # getting only the required columns
            start_df = start_df.ix[:, [
                "start_hour", "start_day", "Start Terminal", "Trip ID"
            ]]
            start_df.columns = ["hour", "day", "terminal_code", "trip_id"]
            start_df.head()

            LOGGER.info("creating end df")
            end_df = inp_data.groupby(
                by=["end_hour", "end_day", "End Terminal"]).count().copy()
            end_df = end_df.reset_index()
            end_df = end_df.ix[:, [
                "end_hour", "end_day", "End Terminal", "Trip ID"
            ]]
            end_df.columns = ["hour", "day", "terminal_code", "trip_id"]
            LOGGER.info(end_df.head())

            LOGGER.info("merging start and end df")
            # merge start and end data frames to generate traffic counts for a terminal
            merged_df = start_df.merge(end_df,
                                       how="inner",
                                       on=["hour", "day", "terminal_code"])

            merged_df[
                "trip_count"] = merged_df["trip_id_x"] + merged_df["trip_id_y"]
            merged_df = merged_df.ix[:, [
                "hour", "day", "terminal_code", "trip_count"
            ]]

            return merged_df

        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
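Because start_df and end_df both carry a trip_id column, the inner merge on hour/day/terminal_code gets pandas' default _x/_y suffixes, which is why the total is trip_id_x + trip_id_y. A toy illustration:

import pandas as pd

start_df = pd.DataFrame({"hour": [8], "day": [1], "terminal_code": [70], "trip_id": [12]})
end_df = pd.DataFrame({"hour": [8], "day": [1], "terminal_code": [70], "trip_id": [9]})

merged = start_df.merge(end_df, how="inner", on=["hour", "day", "terminal_code"])
merged["trip_count"] = merged["trip_id_x"] + merged["trip_id_y"]   # -> 21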
    def train_test_model(self, df):
        try:
            LOGGER.info("Train and test people, train and test")
        except Exception as e:
            LOGGER.error(traceback.format_exc())
            raise e
    def train_model(self, df):
        try:
            LOGGER.info("I love machine learning")
        except Exception as e:
            LOGGER.error(traceback.format_exc())
    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)


            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)


            self.model = logistic_model

        except Exception as e:
            raise e