import json
import os

import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
# Consumer and ConsumerKafka come from the surrounding project


class Regression:
    def __init__(self):
        self.consumer = Consumer('bus', 'localhost')
        self.stream = self.consumer.get_stream()
        self.kafka_stream = ConsumerKafka('bus', 'localhost')
        self.cleaned_stream = self.stream.map(self.clean_up)
        self.conf = SparkConf().setMaster('local').setAppName(
            'linear_regression')
        self.sc = SparkContext(conf=self.conf)
        self.spark = SparkSession(self.sc)

    # kafka_stream and stream are interchangeable here
    def clean_up(self, data):
        """Reduce a raw bus message to one record per stop: the number of
        arrivals currently 'due' plus the stop coordinates."""
        essential_data = list()
        # d1.npy holds a pickled {stop_id: (longitude, latitude)} lookup
        read_dictionary = np.load(
            os.path.join(os.getcwd(), 'model', 'd1.npy'),
            allow_pickle=True).item()
        record = json.dumps(data, separators=(',', ':'))
        values = json.loads(record)
        for i in values:
            rec = values.get(i)
            item = dict()
            item['stopid'] = str(i)
            counter = 0
            # count how many arrivals at this stop are currently 'due'
            for j in rec:
                if j['duetime'] == 'due':
                    counter = counter + 1
            item['due_count'] = str(counter)
            item['longitude'] = read_dictionary[i][0]
            item['latitude'] = read_dictionary[i][1]
            essential_data.append(item)
        return essential_data

    def create_data_frame(self):
        return self.spark.createDataFrame(self.cleaned_stream)

    def train_test_split(self, data):
        # 70% of the data for training, 30% held out for testing
        (train, test) = data.randomSplit([0.7, 0.3])
        return (train, test)

    def linear_regression(self, training_data):
        # expects the default 'features' vector and 'label' columns
        linear_regression = LinearRegression(maxIter=10)
        return linear_regression.fit(training_data)

    def predict(self, model, test_data):
        print('Coefficients: ' + str(model.coefficients))
        print('Intercept: ' + str(model.intercept))
        predictions = model.transform(test_data)
        predictions.select('delay').show()
        return predictions
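A minimal driver sketch for the class above, assuming create_data_frame() yields a batch DataFrame that already carries the default 'features' vector and 'label' columns expected by LinearRegression, plus the 'delay' column read by predict(); none of these assumptions appear in the original:

# Hypothetical usage; see the assumptions noted above.
regression = Regression()
df = regression.create_data_frame()
train, test = regression.train_test_split(df)
model = regression.linear_regression(train)
predictions = regression.predict(model, test)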
Example #2
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Consumer and ConsumerKafka come from the surrounding project


class Classification:
    def __init__(self):
        self.consumer = Consumer('bus', 'localhost')
        self.kafka_stream = ConsumerKafka('bus', 'localhost')
        self.stream = self.consumer.get_stream()

    # kafka_stream and stream are interchangeable here
    def logistic_regression(self):
        # read from the stream: keep messages with a temperature reading
        # and a delay greater than 10000, sorted by key
        rdd = self.stream.filter(lambda message: float(message.temperature)) \
            .filter(lambda message: float(message.delay) > 10000) \
            .transform(lambda rdd: rdd.sortByKey())
        # index the temperature readings and one-hot encode the delay values
        log_reg = LogisticRegression(featuresCol='features', labelCol='delay')
        temperature_indexer = StringIndexer(inputCol='temperature',
                                            outputCol='temp_index')
        delay_encoder = OneHotEncoder(inputCol='delay', outputCol='delay_vector')
        # assemble the feature vector expected by the classifier; the choice
        # of input columns here is an assumption, not part of the original code
        assembler = VectorAssembler(inputCols=['stop_id', 'route_id', 'temp_index'],
                                    outputCol='features')
        pipeline = Pipeline(stages=[temperature_indexer, delay_encoder,
                                    assembler, log_reg])
        # select the required columns and split into training and test sets
        columns = rdd.select(['stop_id', 'delay', 'route_id', 'temperature'])
        train, test = columns.randomSplit([0.7, 0.3])
        fit_model = pipeline.fit(train)
        results = fit_model.transform(test)
        return results
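One way the returned predictions could be scored, assuming 'delay' ends up as a binary numeric label after the pipeline above; the evaluator choice is illustrative, not part of the original:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Hypothetical follow-up: compute area under the ROC curve for the
# predictions produced by Classification.logistic_regression().
classifier = Classification()
results = classifier.logistic_regression()
evaluator = BinaryClassificationEvaluator(labelCol='delay',
                                          rawPredictionCol='rawPrediction')
print('Area under ROC: ' + str(evaluator.evaluate(results)))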
Example #3
from pyspark import SparkConf
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
# Consumer and ConsumerKafka come from the surrounding project


class K_Means:
    def __init__(self):
        self.conf = SparkConf().setMaster('local').setAppName('kmeans')
        self.spark = SparkSession.builder.config(conf=self.conf).getOrCreate()
        # reuse the session's context instead of starting a second SparkContext
        self.sc = self.spark.sparkContext
        self.consumer = Consumer('bus', 'localhost')
        self.stream = self.consumer.get_stream()
        self.kafka_stream = ConsumerKafka('bus', 'localhost')

    # kafka_stream and stream are interchangeable here
    def kmeans(self):
        # read from the stream: keep messages with a temperature reading
        # and a delay greater than 10000, sorted by key
        rdd = self.stream.filter(lambda message: float(message.temperature)) \
            .filter(lambda message: float(message.delay) > 10000) \
            .transform(lambda rdd: rdd.sortByKey())
        df = self.spark.createDataFrame(rdd)
        df.createOrReplaceTempView('kmeans')
        # assemble every column into a single feature vector, then scale it
        assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
        final_df = assembler.transform(df)
        scaler = StandardScaler(inputCol='features',
                                outputCol='scaled_features')
        scaler_model = scaler.fit(final_df)
        return scaler_model.transform(final_df)
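The kmeans() method above stops at feature scaling; a minimal sketch of how the scaled features might then be clustered, where the cluster count and seed are illustrative assumptions rather than values taken from the original:

from pyspark.ml.clustering import KMeans

# Hypothetical follow-up: fit a KMeans model on the scaled features
# returned by K_Means.kmeans(); k=3 and seed=1 are assumptions.
clustering = K_Means()
scaled_df = clustering.kmeans()
kmeans_estimator = KMeans(featuresCol='scaled_features', k=3, seed=1)
kmeans_model = kmeans_estimator.fit(scaled_df)
clustered = kmeans_model.transform(scaled_df)  # adds a 'prediction' column
clustered.select('prediction').show()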