class ErrorAnalysis:
    def __init__(self, predicted, data_file="access_Point_1_incoming.csv"):
        self.csvWriter = CsvWriter(host="",
                                   port=0,
                                   username="",
                                   password="",
                                   database="",
                                   new_measurement="",
                                   new_cvs_file_name="")
        self.predictedValues = predicted
        self.actualValues = self.csvWriter.csv_file_to_dataframe_date_selection(
            path.join(RESOURCES_DIR, data_file), pd.Timestamp(predicted[0, 0]),
            pd.Timestamp(predicted[-1, 0]))

    def compute_error(self):
        actual = self.actualValues["avg_hrcrx_max_byt"].tolist()
        predicted = self.predictedValues[:, 1].tolist()

        self.meanSquaredError = metrics.mean_squared_error(actual, predicted)
        self.meanAbsoluteError = metrics.mean_absolute_error(actual, predicted)

        print("Mean Squared Error: " + str(self.meanSquaredError))
        print("Mean Absolute Error: " + str(self.meanAbsoluteError))

    def plot_predicted_vs_actual(self):
        pyplot.plot(self.predictedValues[:, 0], self.predictedValues[:, 1])
        # The unnamed first CSV column ("") holds the timestamps
        pyplot.plot(np.array(self.actualValues[""].tolist()),
                    np.array(self.actualValues["avg_hrcrx_max_byt"].tolist()))
        pyplot.legend(['Predicted Values', 'Actual Values'])
        pyplot.show()
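
# A minimal usage sketch (hypothetical values): `predicted` is expected to be a
# two-column numpy array of timestamps and forecasts whose date range exists in
# the CSV data file.
if __name__ == '__main__':
    import numpy as np
    example_predictions = np.array([['2017-03-18 00:15:00', 870.0],
                                    ['2017-03-18 00:30:00', 900.0]], dtype=object)
    analysis = ErrorAnalysis(example_predictions)  # loads the matching actual values
    analysis.compute_error()                       # prints MSE and MAE
    # analysis.plot_predicted_vs_actual()          # optional visual comparison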
Example #6
    def __init__(self,
                 default_stride=Stride.WEEKLY,
                 window_length=8,
                 data_file="access_Point_1_incoming.csv"):
        self.defaultStride = default_stride
        self.windowLength = window_length
        self.csvWriter = CsvWriter(host="",
                                   port=0,
                                   username="",
                                   password="",
                                   database="",
                                   new_measurement="",
                                   new_cvs_file_name="")

        # Data is returned as two columns: one with the timestamps and the other with the bytecount values
        self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
            new_filepath=path.join(RESOURCES_DIR, data_file),
            new_row_start=0,
            new_row_end=self.defaultStride.value * self.windowLength)
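
        # Row-count sketch: with the WEEKLY stride (672 = 4 * 24 * 7 fifteen-minute
        # samples per week) and window_length = 8, this slice loads
        # 672 * 8 = 5376 rows, i.e. eight weeks of data.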
Example #7
 def __init__(self, database='generated_data'):
     self.gen_config = GeneratorConfig()
     # Don't run default generator on init
     self.generator = None
     self._selected_model = None
     # is this line needed?
     self._data_writer = CsvWriter(host=db_config.host,
                                   port=db_config.port,
                                   username=db_config.username,
                                   password=db_config.password,
                                   database=database)
 def call_model(self):
     # This function returns a numpy array of timestamps and forecasted data; it can also return observed values
     writer = CsvWriter(host=db_config.host, port=db_config.port, username=db_config.username,
                        password=db_config.password, database='predicted_data')
     #row_end = self.default_rtu + 672
     df = writer.csv_file_to_dataframe(new_filepath=self.default_csv_filename, new_row_end=self.default_rtu, usecols=[0, 1])
     #if self.default_rtu is None:
         #self.default_rtu = df.shape[0]
     series = list(df.values.flatten())
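     # Flattening sketch: df.values.flatten() interleaves the two columns as
     # [ts0, v0, ts1, v1, ...], so series[1::2] selects the byte counts and
     # series[(n - 1) * 2] is the timestamp of the n-th row; e.g. rows
     # ('00:15', 875) and ('00:30', 894) flatten to ['00:15', 875, '00:30', 894].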
     if self.default_rtu is None:
         last_time_stamp = series[-2]
     else:
         last_time_stamp = series[(self.default_rtu-1)*2]
     bytcts = series[1::2][:self.default_rtu]
     self.default_series = bytcts
     smooth_series = self.exponential_smoothing(self.default_series, self.default_alpha)
     result_datetimes = pd.date_range(last_time_stamp, periods=672+1, freq='15min')[1:]
     print(len(result_datetimes))
     nparray_data = np.array([result_datetimes, smooth_series]).transpose()
     self.data_column_name = df.columns[1]
     return nparray_data
class Generator:
    def __init__(self, config_object=None):
        if isinstance(config_object, GeneratorConfig):
            self._Config = config_object
        else:
            self._Config = GeneratorConfig()
        self._Columns = 'avg_hrcrx_max_byt'
        self._data_writer = CsvWriter(host=db_config.host,
                                      port=db_config.port,
                                      username=db_config.username,
                                      password=db_config.password,
                                      database=self._Config.Database)
        self.Dist_Array = self._Config.Func_Type.generate()

    def nparray_to_dataframe(self):
        indexes = pd.DataFrame(self.Dist_Array[:, 0])
        indexes[0] = pd.to_datetime(indexes[0], format='%Y-%m-%d %H:%M:%S')
        cols = [self._Columns]
        df = pd.DataFrame(data=self.Dist_Array[0:, 1:],
                          index=indexes[0],
                          columns=cols)
        return df

    def write_data_to_csv(self):
        df = self.nparray_to_dataframe()
        model_name = self._Config.Func_Type.Name
        if not isinstance(df, pd.DataFrame):
            print("Error reading the data from database.")
            return
        df.to_csv(path.join(RESOURCES_DIR, model_name + "_generated.csv"))

    def write_data_to_database(self):
        df = self.nparray_to_dataframe()
        model_name = self._Config.Func_Type.Name
        df.to_csv(path.join(RESOURCES_DIR, model_name + "_generated.csv"))
        self._data_writer.csv_file_to_db(
            measurement_to_use=model_name + '_generated',
            new_csv_file_name=path.join(RESOURCES_DIR,
                                        model_name + "_generated.csv"))
        remove(path.join(RESOURCES_DIR, model_name + "_generated.csv"))
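
# A minimal usage sketch (assumes db_config and GeneratorConfig are importable):
# generate a distribution, then persist it locally or push it to the database.
if __name__ == '__main__':
    gen = Generator()              # falls back to the default GeneratorConfig()
    generated_df = gen.nparray_to_dataframe()
    gen.write_data_to_csv()        # keeps a local copy in RESOURCES_DIR
    # gen.write_data_to_database() # or round-trip the CSV into InfluxDB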
Example #10
 def test_csv_file_to_data(self):
     try:
         data_return_path = os.path.join(RESOURCES_DIR, 'temp2.csv')
         initial_data_path = os.path.join(RESOURCES_DIR, 'temp.csv')
         test_csv_write3 = CsvWriter(self.host, self.port, self.username, self.password, self.database)
         test_csv_write3.csv_file_to_db()
         test_csv_write3.data_to_csv_file('select * from per15min', new_csv_file_name=data_return_path, fillGaps=False)
         self.assertTrue(self.compare_csv_files(data_return_path, initial_data_path),
                         "Integration test failed, file is not the same")
         os.remove(data_return_path)
     except ConnectionError as error:
         print("Test: Failed - {0}\n".format(error))
Example #11
class TestCsvDf(TestCase):
    filepath = os.path.join(RESOURCES_DIR, 'temp.csv')
    start = 0
    end = 4
    dlt = False
    csvWriter = CsvWriter(host="", port=0, username="", password="", database="", new_measurement="",
                          new_cvs_file_name="")

    def test_csv_to_df(self):
        df = self.csvWriter.csv_file_to_dataframe(new_filepath=self.filepath, new_row_start=self.start, new_row_end=self.end, delete=self.dlt, usecols=[0,1])
        compare_file = os.path.join(RESOURCES_DIR, 'compare.csv')
        df.to_csv(compare_file)
        with open(self.filepath) as f1:
            next(f1)
            string_one = next(f1)
            sone = string_one
        with open(compare_file) as f2:
            next(f2)
            string_two = next(f2)
            stwo = string_two.lstrip('0,/')

        os.remove(compare_file)
        self.assertEqual(sone, stwo, "The specified function test failed, not equal")

    def test_invalid_parameters(self):
        with self.assertRaises(FileNotFoundError):
            df = csv_to_dataframe(filepath="aqqa", row_start=self.start, row_end=self.end, dlt=self.dlt, usecols=[0,1])

        with self.assertRaises(StopIteration):
            self.start = 100000000
            df = csv_to_dataframe(filepath=self.filepath, row_start=self.start, row_end=self.end, dlt=self.dlt, usecols=[0,1])

    def test_csv_data_frame(self):
        a = csv_to_dataframe_date_selection(file_path=self.filepath, usecols=[0, 1],
                                            start_date=pd.Timestamp("2017-03-18 00:15:00"),
                                            end_date=pd.Timestamp("2017-03-18 00:30:00"))
        self.assertIsNotNone(a)
        self.assertEqual(2, len(a))
        self.assertEqual(875, np.array(a)[0, 1])
        self.assertEqual(894, np.array(a)[1, 1])
class HoltWinters:
    def __init__(self,
                 series=None,
                 n_preds=672,
                 n_weeks=5,
                 slen=672,
                 alpha=0.816,
                 beta=0.0001,
                 gamma=0.993,
                 data_file="access_Point_1_incoming.csv"):
        self.default_series = series
        self.default_stride_length = slen
        self.default_alpha = alpha
        self.default_beta = beta
        self.default_gamma = gamma
        self.default_num_predictions = n_preds
        self.default_num_train_weeks = n_weeks
        self.data_column_name = ""
        self.csvWriter = CsvWriter(host="",
                                   port=0,
                                   username="",
                                   password="",
                                   database="",
                                   new_measurement="",
                                   new_cvs_file_name="")
        self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
            new_filepath=path.join(RESOURCES_DIR, data_file), new_row_start=0)

    # This method is called by the PFramework to display the parameters and
    # allow them to be set to something other than the defaults.
    def set_parameters(self):
        """
        Asking user to change a parameters specific to a model, if needed
        :return:
        """
        print("The default number of datapoints to predict: {}".format(
            self.default_num_predictions))
        print("The default number of  training weeks: {}".format(
            self.default_num_train_weeks))
        print("The default seasonal stride length: {}".format(
            self.default_stride_length))
        print("The default alpha value: {}".format(self.default_alpha))
        print("The default beta value: {}".format(self.default_beta))
        print("The default gamma value: {}".format(self.default_gamma))
        print(
            "Would you like to set the parameters for Holt-Winters first? [y]/[n]"
        )
        selection = input("Prompt: ")
        if selection.lower() == 'y':
            print("Choose the number of datapoints to predict")
            selection = input("Prompt: ")
            self.default_num_predictions = int(selection)

            print("Choose the number of training weeks")
            selection = input("Prompt: ")
            self.default_num_train_weeks = int(selection)

            print("Choose the seasonal stride length")
            selection = input("Prompt: ")
            self.default_stride_length = int(selection)

            print("Choose the desired alpha value")
            selection = input("Prompt: ")
            self.default_alpha = float(selection)

            print("Choose the desired beta value")
            selection = input("Prompt: ")
            self.default_beta = float(selection)

            print("Choose the desired gamma value")
            selection = input("Prompt: ")
            self.default_gamma = float(selection)

    # This finds the average trend across all seasonal values, to be used as
    # the initial trend for the model
    def initial_trend(self, series, slen):
        total = 0.0
        for i in range(slen):
            total += float(series[i + slen] - series[i]) / slen
        return total / slen
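
    # Worked example (toy numbers): for series [10, 20, 30, 40, 12, 22, 32, 42]
    # and slen = 4, each term is (12 - 10) / 4 = 0.5, the loop sums to 2.0, and
    # the initial trend is 2.0 / 4 = 0.5 per step.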

    # This calculates the initial seasonal values corresponding to each observed season.
    # An explanation of the reasoning for this can be found
    # here: http://www.itl.nist.gov/div898/handbook/pmc/section4/pmc435.htm
    def initial_seasonal_components(self, series, slen):
        seasonals = {}
        season_averages = []
        n_seasons = int(len(series) / slen)
        # compute season averages
        for j in range(n_seasons):
            season_averages.append(
                sum(series[slen * j:slen * j + slen]) / float(slen))
        # compute initial values
        for i in range(slen):
            sum_of_vals_over_avg = 0.0
            for j in range(n_seasons):
                sum_of_vals_over_avg += series[slen * j +
                                               i] - season_averages[j]
            seasonals[i] = sum_of_vals_over_avg / n_seasons
        return seasonals
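
    # Worked example (same toy series): the two season averages are 25 and 27,
    # so the initial components are {0: -15, 1: -5, 2: 5, 3: 15}; e.g. index 0
    # is ((10 - 25) + (12 - 27)) / 2 = -15.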

    # This includes the full algorithm for implementing Holt-Winters.
    # We can assume that all factors (alpha, beta and gamma) have been defined
    def triple_exponential_smoothing(self, series, slen, alpha, beta, gamma,
                                     n_preds):
        result = list()
        seasonals = self.initial_seasonal_components(series, slen)
        for i in range(len(series) + n_preds):
            if i == 0:  # initial values
                smooth = series[0]
                trend = self.initial_trend(series, slen)
                continue
            if i >= len(series):  # we are forecasting
                m = i - len(series) + 1
                forecast = (smooth + m * trend) + float(seasonals[i % slen])
                forecast = float(0) if forecast < 0 else forecast  # clamp negative forecasts to zero
                result.append(forecast)
            else:
                val = series[i]
                last_smooth, smooth = smooth, alpha * (
                    val - seasonals[i % slen]) + (1 - alpha) * (smooth + trend)
                trend = beta * (smooth - last_smooth) + (1 - beta) * trend
                seasonals[i % slen] = gamma * (val - smooth) + (
                    1 - gamma) * seasonals[i % slen]
        return result

    # This is the method called by the PFramework to initiate the generation of data using the Holt-Winters algorithm.
    # What follows will correct the gaps within the provided time series dataset, pass this into the triple exponential
    # smoothing algorithm and return the predicted datapoints (672 or one week) with their corresponding timestamps
    def call_model(self):
        # build dataframe
        df = fg.fill_data_gaps(self.returned_data_frame.shape[0],
                               init_data=self.returned_data_frame)
        df['avg_hrcrx_max_byt'] = df['avg_hrcrx_max_byt'].fillna(0)
        self.data_column_name = df.columns[1]

        # create list from dataframe to pass to triple exponential smoothing
        tmp_series = list(df.values.flatten())
        tmp_default_series = tmp_series[1::2]

        # Build training set based on specified number of training weeks
        tmp_training_count = self.default_num_train_weeks * self.default_stride_length
        self.default_series = tmp_default_series[0:tmp_training_count - 1]

        # call triple_exponential_smoothing with series = byte counts column in dataframe
        smooth_series = self.triple_exponential_smoothing(
            self.default_series, self.default_stride_length,
            self.default_alpha, self.default_beta, self.default_gamma,
            self.default_num_predictions)

        # generate 672 new sequential timestamps (per 15 min) from the start of the prediction period;
        # the unnamed column "" holds the timestamps
        start_date = df[''][tmp_training_count]
        result_datetimes = pd.date_range(start_date,
                                         periods=len(smooth_series),
                                         freq='15min')

        # assign new timestamps to datapoints
        nparray_data = np.array([result_datetimes, smooth_series]).transpose()

        # pass back the completed numpy array (the caller turns it into a dataframe or a csv file)
        return nparray_data

    def get_data_column_name(self):
        return self.data_column_name
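
# A minimal sketch (synthetic data) exercising the smoothing alone; __init__ is
# bypassed with object.__new__ because it reads a CSV through CsvWriter.
if __name__ == '__main__':
    toy_series = [10, 20, 30, 40] * 6  # six repetitions of a 4-point season
    hw = object.__new__(HoltWinters)   # bare instance, no CSV access
    forecast = hw.triple_exponential_smoothing(
        toy_series, slen=4, alpha=0.5, beta=0.1, gamma=0.5, n_preds=4)
    print(forecast)  # roughly reproduces the 10/20/30/40 seasonal pattern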
Example #13
 def setUp(self):
     test_csv_write = CsvWriter(self.host, self.port, self.username, self.password, self.database)
     self.assertNotEqual(0, test_csv_write._client, "CsvWriter client was not created")
class TrafficPredictor:
    _default_stride = None
    _num_of_series = None
    _selected_model = None
    _data_writer = None

    def __init__(self, database="predicted_data"):
        self._default_stride = Stride.WEEKLY
        self._num_of_series = 8
        self._selected_model = None
        self._data_writer = CsvWriter(host=db_config.host,
                                      port=db_config.port,
                                      username=db_config.username,
                                      password=db_config.password,
                                      database=database)

    def main(self):
        print("Welcome to the Traffic Predictor!")
        print("Please choose your model (enter its index):")
        for x, model in enumerate(models):
            print("{0}: {1}".format(x, model))
        print("-: Exit")

        selection = input("Prompt: ")

        if selection == '-':
            return
        else:
            try:
                model = models[int(selection)]
                print("Please, wait...")
                prediction = self.call_model(model)
                df = self.nparray_to_dataframe(prediction)
                print("Finished prediction")
                print(
                    "Would you like to run Error analysis on the predicted data? [y]/[n]"
                )
                selection = input("Prompt: ")
                if selection.lower() == 'y':
                    err_analysis = ErrorAnalysis(prediction)
                    err_analysis.compute_error()
                print(
                    "Would you like to write predicted data to database? [y]/[n]"
                    "\nIf selected [n] the data will be written to local csv file"
                )
                selection = input("Prompt: ")
                if selection.lower() == 'y':
                    self.write_data_to_database(model, df)
                else:
                    self.write_data_to_csv(model, df)

            except IndexError:
                print("There's no model under index: {}".format(selection))
            except TypeError:
                print(
                    "ERROR: The model import failed. Please make sure to properly add/choose your model."
                )
                raise

    def call_model(self, model_name):
        model_root = 'PModules.' + model_name + "." + model_name + "." + model_name
        model = locate(model_root)
        self._selected_model = model()
        # Your model class instance
        self._selected_model.set_parameters()
        result = self._selected_model.call_model()

        return result

    def write_data_to_csv(self, model_name, df):
        if not isinstance(df, pd.DataFrame):
            print(
                "Error reading the data from database. Please test this query in Chronograf/Grafana."
            )
            return
        df.to_csv(path.join(RESOURCES_DIR, model_name + "_predicted.csv"))

    def write_data_to_database(self, model_name, df):
        df.to_csv(path.join(RESOURCES_DIR, model_name + "_predicted.csv"))
        self._data_writer.csv_file_to_db(
            measurement_to_use=model_name + '_predicted',
            new_csv_file_name=path.join(RESOURCES_DIR,
                                        model_name + "_predicted.csv"))
        remove(path.join(RESOURCES_DIR, model_name + "_predicted.csv"))

    def nparray_to_dataframe(self, nparray_data):
        indexes = pd.DataFrame(nparray_data[:, 0])
        indexes[0] = pd.to_datetime(indexes[0], format='%Y-%m-%d %H:%M:%S')
        cols = [self._selected_model.get_data_column_name()]
        df = pd.DataFrame(data=nparray_data[0:, 1:],
                          index=indexes[0],
                          columns=cols)
        return df
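
# A minimal sketch (hypothetical module layout) of the dotted path call_model
# builds: for model_name "HoltWinters" it becomes
# "PModules.HoltWinters.HoltWinters.HoltWinters" (package.module.class), which
# locate (presumably pydoc.locate) resolves to the class object, or None.
if __name__ == '__main__':
    from pydoc import locate
    model_class = locate('PModules.HoltWinters.HoltWinters.HoltWinters')
    if model_class is not None:  # locate returns None on a bad dotted path
        model_instance = model_class()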
Example #15
class SimpleMovingAverage:
    """
    Calculates the Simple Moving Average on a daily/weekly basis
     This class makes N different series depending on the selected stride, 96(4*24) in the case of daily and 672(4*24*7)
     in the case of weekly. This is under the assumption that there is a periodic relationship in the data. For example,
     for a daily stride, it is being assumed that there is a correlation between all the 9AM values that occur, and the
     prediction for the next 9AM value is a moving average of all the selected days before it.

    :param default_stride: represents the stride for calculating the moving average (DAILY/WEEKLY)
    :param window_length: Number of days in a single series
    :param data_file: Name of the file that exists in the predictor_resources folder
    :return: numpy array object with two columns, a timeseries object(in epoch format) and the predicted bytecount

    """

    formattedInput = []
    lastDate = ""
    data_column_name = ""

    def __init__(self,
                 default_stride=Stride.WEEKLY,
                 window_length=8,
                 data_file="access_Point_1_incoming.csv"):
        self.defaultStride = default_stride
        self.windowLength = window_length
        self.csvWriter = CsvWriter(host="",
                                   port=0,
                                   username="",
                                   password="",
                                   database="",
                                   new_measurement="",
                                   new_cvs_file_name="")

        # Data is returned as two columns: one with the timestamps and the other with the bytecount values
        self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
            new_filepath=path.join(RESOURCES_DIR, data_file),
            new_row_start=0,
            new_row_end=self.defaultStride.value * self.windowLength)

    def set_parameters(self):
        """
        Asking user to change a parameters specific to a model, if needed
        :return:
        """
        print("The default stride: {}".format(self.defaultStride.name))
        print("The default number of  series: {}".format(self.windowLength))
        print(
            "Would you like to set the parameters for Simple Moving Average first? [y]/[n]"
        )
        selection = input("Prompt: ")
        if selection.lower() == 'y':
            print("Choose the stride (WEEKLY/DAILY): [W]/[D]")
            selection = input("Prompt: ")
            if selection.upper() == 'W':
                self.defaultStride = Stride.WEEKLY
            if selection.upper() == 'D':
                self.defaultStride = Stride.DAILY
            print("Choose the number of series.")
            selection = input("Prompt: ")
            if self.defaultStride == Stride.DAILY and int(selection) < 7:
                print(
                    "You cannot use a training set of less than 7 days. It will be left as the default"
                )
            elif self.defaultStride == Stride.WEEKLY and int(selection) > 52:
                print(
                    "The number of series cannot exceed one year. It will be left as the default"
                )
            else:
                self.windowLength = int(selection)

    def initialize_dataframe_output(self):
        # Input formatting for future calculation
        numpy_array = np.array(self.returned_data_frame)[:, 1]
        numpy_array = numpy_array.reshape(
            (numpy_array.size // self.defaultStride.value,
             self.defaultStride.value)).transpose()
        self.formattedInput = numpy_array

        # Getting the last day in the "training" data. Used to generate the output timeseries later
        self.lastDate = np.array(self.returned_data_frame)[-1:, :-1][0][0]

    def call_model(self):
        self.initialize_dataframe_output()
        numpy_array = self.formattedInput

        # Build an averaging kernel: an array of windowLength ones, each divided by windowLength
        x = np.ones(self.windowLength) / self.windowLength

        if self.defaultStride == Stride.DAILY:
            loop_count = 7
        elif self.defaultStride == Stride.WEEKLY:
            loop_count = 1

        # Calculating moving average here
        for i in range(loop_count):
            y = signal.convolve(numpy_array, [x], mode="valid")
            numpy_array = np.concatenate((numpy_array, y), axis=1)
            numpy_array = numpy_array[:, 1:]

        if self.defaultStride == Stride.DAILY:
            predictions = numpy_array[:, -7:]
            predictions = predictions.transpose().reshape(1,
                                                          predictions.size)[0]

        elif self.defaultStride == Stride.WEEKLY:
            predictions = numpy_array[:, -1]

        # Creates one week of new timestamps; date_range is inclusive of the start date, so the first element is dropped
        result_datetimes = pd.date_range(self.lastDate,
                                         periods=Stride.WEEKLY.value + 1,
                                         freq='15min')[1:]
        nparray_data = np.array([result_datetimes, predictions]).transpose()
        self.data_column_name = self.returned_data_frame.columns[1]
        return nparray_data

    def get_data_column_name(self):
        return self.data_column_name
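
# A minimal sketch of the convolution step above: each row is one "series"
# (e.g. all the 9AM samples), and convolving with the uniform kernel yields one
# moving-average prediction per row.
if __name__ == '__main__':
    import numpy as np
    from scipy import signal
    window_length = 4
    rows = np.array([[1.0, 2.0, 3.0, 4.0],       # predicts the mean, 2.5
                     [10.0, 20.0, 30.0, 40.0]])  # predicts the mean, 25.0
    kernel = np.ones(window_length) / window_length
    print(signal.convolve(rows, [kernel], mode="valid"))  # [[ 2.5] [25. ]]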
"""
from predictor_resources.config import RESOURCES_DIR
from predictor_resources import db_config
import sys
from os import path, remove
from root import ROOT_DIR

sys.path.append(path.join(ROOT_DIR, 'CPacket-Common-Modules'))
from io_framework.csv_writer import CsvWriter
from io_framework.db_connector.db_connector import InfluxDBConnector
from io_framework.csv_fill_data_gaps import fill_data_gaps

database = 'AccessPoints'  # choose this if you want to use different DB
data_processor = CsvWriter(host=db_config.host,
                           port=db_config.port,
                           username=db_config.username,
                           password=db_config.password,
                           database=database)
connector = InfluxDBConnector(host=db_config.host,
                              port=db_config.port,
                              database=database)


def data_with_filled_gaps_to_db(file_path=None, new_measurement=None):
    df = data_processor.csv_file_to_dataframe(
        new_filepath=file_path)  # Change usecols here if you need
    dr = fill_data_gaps(init_data=df)
    dr.set_index('', inplace=True)  # '' is the unnamed timestamp column
    dr.to_csv(path_or_buf=path.join(RESOURCES_DIR, "temp.csv"))
    data_processor.csv_file_to_db(measurement_to_use=new_measurement,
                                  new_csv_file_name=path.join(