Example #1
def testNetwork(config):
    ''' Test all data using a session saved at config.path_savedSession '''
    pl_input, pl_output, nn, saver, graph, _ = setupNet(config)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, config.path_savedSession)
        debugInfo(__name__, "Restored session")
        #return test_DataPrintOutput(nn,sess,pl_input,pl_output,config,fileName = config.path_outputFile)
        prediction = test_nonRandomizedPrediction(nn, sess, pl_input,
                                                  pl_output, config)

    sz_o = config.data.getNumberOutputs()
    output = pd.DataFrame(
        np.empty((config.data.getNumberTestPoints(), 2 * sz_o)))
    y = dsh.denormalizeData(config.data.test.outputData, config.data.max_value)
    y_ = dsh.denormalizeData(prediction, config.data.max_value)
    output.iloc[:, 0:sz_o] = y
    output.iloc[:, sz_o:2 * sz_o] = y_
    output.index = config.data.test.rowNames
    debugInfo(__name__,
              "Printing prediction output to %s" % config.path_outputFile)
    output.to_csv(config.path_outputFile,
                  header=[(lambda x: "i_%d" % x)(to)
                          for to in config.timeOffsets] +
                  [(lambda x: "o_%d" % x)(to) for to in config.timeOffsets])

    mae = np.mean(np.abs(y - y_), 0)
    print(mae)
    return mae
Example #2
def trainNetwork(config):

    pl_input, pl_output, nn, saver, graph, summary_op = setupNet(config)

    with tf.Session(graph=graph) as sess:

        summary_writer = tf.train.SummaryWriter(config.path_TFoutput,
                                                sess.graph)

        sess.run(tf.initialize_all_variables())

        for step in range(config.max_steps):
            myFeedDict = config.data.test.fill_feed_dict(
                pl_input, pl_output, Configuration.batch_size)

            loss_value, predicted = sess.run([nn.optimize, nn.prediction],
                                             feed_dict=myFeedDict)
            if (step % Configuration.test_step == 0):
                if (args.trackPredictions is not None):
                    test_allDataAppendToDf(nn, sess, pl_input, pl_output,
                                           config_track,
                                           int(step / config.test_step) + 1)
                #debugInfo(__name__,dsh.denormalizeData(predicted,config.data.max_value))
                #summary_writer.add_summary(summary_str)
                #summary_writer.flush()

                mean = sess.run(nn.evaluation, feed_dict=myFeedDict)
                debugInfo(
                    __name__,
                    "Training step : %d of %d" % (step, config.max_steps))
                debugInfo(
                    __name__, "Mean test error is %f" %
                    dsh.denormalizeData(mean, config.data.max_value))
        path_savedSession = saver.save(sess, config.path_savedSession)
Example #3
def testNetwork(config):
    ''' Test all data using a session saved at config.path_savedSession '''
    pl_input, pl_output, nn, saver, graph, _ = setupNet(config)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, config.path_savedSession)
        debugInfo(__name__, "Restored session")
        #return test_DataPrintOutput(nn,sess,pl_input,pl_output,config,fileName = config.path_outputFile)
        prediction = test_nonRandomizedPrediction(nn, sess, pl_input,
                                                  pl_output, config)

    output = pd.DataFrame(
        np.empty((config.data.getNumberTrainingPoints(),
                  2 * config.number_target_neurons)))
    y = dsh.denormalizeData(
        config.data.train.outputData.reshape(
            [-1, config.number_target_neurons]), config.data.max_value)
    y_ = dsh.denormalizeData(
        prediction.reshape([-1, config.number_target_neurons]),
        config.data.max_value)
    output.iloc[:, 0:config.number_target_neurons] = y
    output.iloc[:, config.number_target_neurons:2 *
                config.number_target_neurons] = y_

    #output.index = config.data.train.rowNames
    output.to_csv(config.path_outputFile)
    debugInfo(__name__,
              "Printing prediction output to %s" % config.path_outputFile)

    mae = np.mean(np.abs(y - y_), 0)
    print(mae)
    return mae
Example #4
def read_csv_and_pivot_with_rollingAvg(inputFile, specifiedSensors = None, sql_headers = ['S_IDX','ZEIT','wert'], window = 15):

    data_wide_all = pivot_simple(inputFile, specifiedSensors, sql_headers)
    data_wide_all = data_wide_all.rolling(window,min_periods =1).mean()
    debugInfo(__name__,"Calculated the rolling average using window %d : (%d, %d)"%(window,data_wide_all.shape[0],data_wide_all.shape[1]))

    return data_wide_all
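
A minimal usage sketch for the rolling-average variant, assuming pivot_simple and debugInfo are importable from the same module; the csv path and sensor ids below are placeholders, not values from the project:

import pandas as pd

# hypothetical sensor ids and file path, for illustration only
sensors = pd.DataFrame([4132, 4133, 4134])
smoothed = read_csv_and_pivot_with_rollingAvg("sql_export_july.csv",
                                              specifiedSensors=sensors,
                                              window=15)   # 15-minute rolling mean
print(smoothed.shape)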
Example #5
File: CNN.py  Project: tony32769/dlsd
	def evaluate(self):
		debugInfo(__name__,"Adding Evaluation nodes to the graph")
		predictions = self.prediction
		rounded = tf.round(predictions)
		correct_prediction = tf.equal(rounded,self.targets)
		accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
		return accuracy,correct_prediction,predictions,rounded
Example #6
def removeInefficientSensors(data_wide_all, sensorEfficiency):
    #count the number of times each column has an 'na' value
    counts = np.zeros((data_wide_all.shape[1], 1))
    print(data_wide_all.shape)
    for i in range(0, data_wide_all.shape[1]):
        counts[i] = len(np.where(np.isnan(data_wide_all.iloc[:, i]))[0])

    # calculate the efficiency of the sensor
    sensorsToEfficiency = pd.DataFrame(np.zeros((3, counts.shape[0])))
    sensorsToEfficiency.iloc[0, :] = data_wide_all.columns.values.reshape(
        1, -1)
    sensorsToEfficiency.iloc[
        2, :] = 1 - counts.reshape(1, -1) / (data_wide_all.shape[0])
    sensorsToEfficiency.iloc[1, :] = counts.reshape(1, -1)

    # keep only sensors whose efficiency exceeds the given threshold
    efficientSensorIndices = np.where(
        sensorsToEfficiency.iloc[2, :].values > sensorEfficiency)
    data_wide = data_wide_all.iloc[:, efficientSensorIndices[0]]
    debugInfo(
        __name__, "Data where sensors have efficiency > %.2f : (%d, %d)" %
        (sensorEfficiency, data_wide.shape[0], data_wide.shape[1]))
    debugInfo(
        __name__,
        "There are %d sensors in total, but only %d have efficiency > %.2f" %
        (data_wide_all.shape[1], data_wide.shape[1], sensorEfficiency))
    return data_wide
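
The efficiency of a sensor here is 1 minus the fraction of nan rows in its column, and only columns above the threshold survive. A compact vectorized sketch of the same filter, for comparison (illustrative only, not the project's implementation):

import pandas as pd

def remove_inefficient_sensors_vectorized(data_wide_all, sensorEfficiency):
    # fraction of non-nan entries per column
    efficiency = 1 - data_wide_all.isna().mean(axis=0)
    # keep only columns whose efficiency exceeds the threshold
    return data_wide_all.loc[:, efficiency > sensorEfficiency]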
Example #7
 def error(self):
     debugInfo(__name__,"Adding MNIST Error nodes to the graph")
     # raw error : difference between target and prediction
     final_error = tf.sub(self.target,self.prediction,name="myError")
     tf.histogram_summary("final_error",final_error)
     mean = tf.reduce_mean(final_error,0)
     tf.histogram_summary("mean_error",mean)
     return final_error
Example #8
 def evaluation(self):
     debugInfo(__name__,"Adding Evaluation nodes to the graph")
     # mean absolute error between target and prediction
     final_error = tf.abs(tf.sub(self.target,self.prediction,name="myEvaluationError"))
     #tf.histogram_summary("evaluation_final_error",final_error)
     mean = tf.reduce_mean(final_error)
     #tf.scalar_summary("evaluation_mean_error",mean)
     return mean
Example #9
 def toString(self):
     debugInfo(
         __name__,
         "FullDataSet Object : [ Train : input (%d, %d)  output (%d, %d) ]\t [ Test : input (%d, %d)  output (%d, %d) ]"
         % (self.train.inputData.shape[0], self.train.inputData.shape[1],
            self.train.outputData.shape[0], self.train.outputData.shape[1],
            self.test.inputData.shape[0], self.test.inputData.shape[1],
            self.test.outputData.shape[0], self.test.outputData.shape[1]))
Example #10
def prepareData(data_wide,
                indexOutputSensor,
                inputFunction,
                config=None,
                adjacency=None):
    '''
        Creates a dataframe containing desired input/output within the same table
        Args:
            data_wide : numpy array of all data (eg pivoted and smooth sqlToNumpy output)
            indexOutputSensor : the sensor to be predicted
            inputFunction : pd_ function (1 of 8) that formats input data in the desired manner
            config : configuration object providing rnn_input_time_sequence and rnn_target_time_sequence
            adjacency : optional : a single numpy vector

            target :                input :
            t_5                     t_0
            t_6                     t_1
            .                       .
            .                       .
            t_15                    t_10
    '''
    # input data is moved vertically down by max of timeOffsets
    #max_output = max(timeOffsets)
    max_output = 0

    index_output_begin = max(config.rnn_input_time_sequence) + min(
        config.rnn_target_time_sequence)

    i = inputFunction(
        data_wide,
        indexOutputSensor,
        s=config.rnn_input_time_sequence,
        a=adjacency,
        max_output=0)[
            max(config.rnn_input_time_sequence):-index_output_begin, :]
    #df = pd.DataFrame(i)
    #df.to_csv("/Users/ahartens/Desktop/input.csv")

    i = i.reshape([i.shape[0], len(config.rnn_input_time_sequence), -1])
    print(i.shape)
    debugInfo(__name__,
              "Preparing data : %d inputs %d" % (i.shape[1], i.shape[0]))
    # create 'output' data :
    #o = timeOffsetData(data_wide[:,indexOutputSensor],timeOffsets,b=max(sequential))
    o = data_wide[index_output_begin:, indexOutputSensor]
    #df = pd.DataFrame(o)
    #df.to_csv("/Users/ahartens/Desktop/output.csv")

    o = o.reshape([o.shape[0], len(config.rnn_target_time_sequence), 1])
    debugInfo(__name__,
              "Preparing data : %d outputs %d" % (o.shape[1], o.shape[0]))

    # combine input/output in one dataframe
    return i, o, i.shape[1]
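
The reshapes above produce the (batch, time steps, features) layout expected by the LSTM placeholders in setupNet; the target is a single sensor, so the output becomes (batch, 1, 1) when one target time point is used. A toy shape check under those assumptions (sizes are made up):

import numpy as np

n_rows, n_steps, n_sensors = 100, 5, 3            # hypothetical sizes
flat_inputs = np.zeros((n_rows, n_steps * n_sensors))
i = flat_inputs.reshape([n_rows, n_steps, -1])    # -> (100, 5, 3)

flat_outputs = np.zeros(n_rows)                   # one target sensor, one target time point
o = flat_outputs.reshape([n_rows, 1, 1])          # -> (100, 1, 1)
print(i.shape, o.shape)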
Example #11
File: CNN.py  Project: tony32769/dlsd
	def error(self):
		debugInfo(__name__,"Adding Error nodes to the graph")
		# cross entropy error (the squared-error version below is left commented out)
		#error_op = tf.square(tf.sub(self.targets,self.prediction),name="error")
		cross_entropy = tf.reduce_mean(-tf.reduce_sum(self.targets * tf.log(self.prediction), reduction_indices=[1]))


		tf.histogram_summary("error",cross_entropy)
		return cross_entropy
Example #12
 def evaluation(self):
     debugInfo(__name__,"Adding MNIST Evaluation nodes to the graph")
     # classification accuracy : fraction of predictions equal to targets
     prediction = self.prediction
     predictions = tf.argmax(prediction,1)
     targets = tf.argmax(self.target,1)
     counts = tf.to_float(tf.equal(predictions,targets,"Check_Equal"))
     #tf.scalar_summary("evaluation_mean_error",mean)
     mean = tf.reduce_mean(counts)
     return mean, counts, predictions, targets
Example #13
 def prediction(self):
     debugInfo(__name__,"Adding Prediction nodes to the graph")
     with tf.name_scope('layer1'):
         weights = tf.Variable(tf.truncated_normal((self.n_input,self.n_hidden),stddev=0.1), name="lay1_weights")
         bias = tf.Variable(tf.constant(0.1,shape=[self.n_hidden]), name = "lay1_bias")
         out_layer1 = tf.nn.sigmoid(tf.matmul(self.data,weights)+bias, name = "lay1_output")
     with tf.name_scope('layer2'):
         weights = tf.Variable(tf.truncated_normal((self.n_hidden,self.n_output),stddev=0.1), name="lay2_weights")
         bias = tf.Variable(tf.constant(0.1,shape=[self.n_output]), name="lay2_bias")
         out_layer2 = tf.nn.sigmoid(tf.matmul(out_layer1,weights)+bias, name = "lay2_output")
     return out_layer2
Example #14
def splitDataToTrainAndTest(data_df, train_frac):
    '''
        @   param data_df       Pandas dataframe object of all data, each row is data point
        @   param train_frac    Float determining how much reserved for training
        
        @   return  train,test  Two pandas dataframes                 
    '''
    debugInfo(__name__,
              "Splitting data to train and test fraction %.2f" % (train_frac))
    train = data_df.sample(frac=train_frac, random_state=1)
    test = data_df.loc[~data_df.index.isin(train.index)]
    return train, test
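
A small usage sketch with a toy dataframe (debugInfo is assumed to be importable alongside the function); sample() draws a random 80% of the rows, random_state=1 keeps the split reproducible, and the remaining rows become the test set:

import numpy as np
import pandas as pd

toy = pd.DataFrame(np.arange(20).reshape(10, 2), columns=['a', 'b'])
train, test = splitDataToTrainAndTest(toy, train_frac=0.8)
print(train.shape, test.shape)    # (8, 2) and (2, 2)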
Example #15
def read_csv_and_pivot(inputFile, specifiedSensors = None, sql_headers = ['S_IDX','ZEIT','wert']):
    '''
        First step of all further analyses :
        Long/narrow dataset from SQL is made wide (one column per sensor, time stamps are rows)
        Format of input file must be [S_IDX,ZEIT,wert]
        Args : 
            inputFile :         Path to csv file generated by sql
            specifiedSensors :  numpy array of sensor Ids that should be used. If present, inefficient sensors are not removed
            sql_headers :       Column names of the sql export (sensor id, time stamp, value)
        Return :
            data_wide_all :     Pandas dataframe containing desired data
    '''
    debugInfo(__name__,"Beginning to read file")
    all_data = pd.read_csv(inputFile,sep=",")
    debugInfo(__name__,"Read input SQL file with shape : (%d, %d)"%(all_data.shape[0],all_data.shape[1]))

    if (specifiedSensors is not None):
        debugInfo(__name__,"%d Sensors specified, getting indices from"%specifiedSensors.shape[0])
        sensorIndices = np.where(all_data.iloc[:,0].values==specifiedSensors.values)[1]
        all_data = all_data.iloc[sensorIndices,:]
    
    # make into a wide table
    data_wide_all = all_data.pivot(index=sql_headers[1], columns=sql_headers[0], values=sql_headers[2])
    debugInfo(__name__,"Pivoted input shape : (%d, %d)"%(data_wide_all.shape[0],data_wide_all.shape[1]))
 
    return data_wide_all
Example #16
def formatFromSQL(path_sqlFile=None,
                  path_preparedData=None,
                  specifiedSensorsArray=None):
    # remake data from SQL output and min/max normalize it
    if (path_sqlFile is not None):
        debugInfo(__name__,
                  "Processing data from an SQL file %s" % path_sqlFile)
        data_df, _, specifiedSensors = stn.pivotAndSmooth(
            path_sqlFile, specifiedSensorsArray)
        data_df, max_value = dsh.normalizeData(data_df)
    # If no SQL data then open file and min/max normalize data
    else:
        debugInfo(__name__,
                  "Opening preprocessed data file %s" % path_preparedData)
        data_df, max_value = dsh.normalizeData(pd.read_csv(path_preparedData))
        # assumed fix: without this, specifiedSensors would be undefined on this branch
        specifiedSensors = specifiedSensorsArray
    return data_df, max_value, specifiedSensors
Example #17
	def calculate_average_week(self):
		
		df = self.df.values
		length_week = 7*1440
		num_sensors = df.shape[1]
		num_weeks = df.shape[0]/length_week

		df_avg = np.zeros([length_week,num_sensors])

		debugInfo(__name__,"Data successfully prepared, finding average of %d weeks"%num_weeks)

		for time_in_week in range(0,length_week):
		    # get indices of all rows corresponding to a certain time of the day/week 
		    idxs_for_time_n = [(length_week*week_idx)+time_in_week for week_idx in range(0,int(num_weeks))]
		    # (eg monday 00:02) is equal to the average of every monday at 00:02
		    df_avg[time_in_week] = np.nanmean(df[idxs_for_time_n],0)

		self.df = pd.DataFrame(df_avg)
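
With one row per minute, a week spans 7 * 1440 = 10080 rows, and the averaged value for a given minute of the week is the nanmean over that offset in every week. A toy illustration of the index arithmetic (the number of weeks and the chosen offset are made up):

length_week = 7 * 1440            # 10080 rows per week, one row per minute
num_weeks = 3                     # hypothetical number of full weeks in the data
time_in_week = 62                 # e.g. 01:02 on the first day of the week
idxs_for_time_n = [(length_week * week_idx) + time_in_week for week_idx in range(num_weeks)]
print(idxs_for_time_n)            # [62, 10142, 20222]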
Example #18
    def prediction(self):
        debugInfo(__name__,"Adding LSTM Prediction nodes to the graph")
        
        cell = tf.nn.rnn_cell.LSTMCell(num_units=self.n_hidden,state_is_tuple=True)
        
        outputs,last_states = tf.nn.dynamic_rnn(
            cell=cell,inputs=self.data,dtype=tf.float32)

        last_output = outputs[:,self.rnn_number_steps-1,:]
        # outputs contains a tensor with shape ( batch size, rnn_sequence_length , n_hidden)
        # only the rnn layers are connected! to create an output layer of the proper size, a new activation function is needed
        
        # activation function for output layer
        with tf.name_scope('outputLayer'):
            weights = tf.Variable(tf.truncated_normal((self.n_hidden,self.n_output),stddev=0.1), name="lay2_weights")
            bias = tf.Variable(tf.constant(0.1,shape=[self.n_output]), name="lay2_bias")
            out_layer2 = tf.nn.sigmoid(tf.matmul(last_output,weights)+bias, name = "lay2_output")
        return out_layer2
Example #19
def setupNet(config):
    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        pl_input = tf.placeholder(tf.float32,
                                  shape=[None,
                                         config.data.getNumberInputs()],
                                  name="input_placeholder")
        pl_output = tf.placeholder(
            tf.float32,
            shape=[None, config.data.getNumberOutputs()],
            name="target_placeholder")
        # create neural network and define in graph
        debugInfo(__name__, "Creating neural network")
        nn = model.SimpleNeuralNetwork(pl_input, pl_output, config.n_hidden,
                                       config.learningRate)
        saver = tf.train.Saver()
        summary_op = tf.merge_all_summaries()

        return pl_input, pl_output, nn, saver, graph, summary_op
Example #20
    def __init__(self, data, target, number_hidden_nodes, learning_rate, rnn_number_steps=None):
        '''
            Args : 
                data :                      tensorflow placeholder to hold input data
                target :                    tensorflow placeholder to hold (true) output data (target value)
                number_hidden_nodes :       number of nodes in the hidden layer
                learning_rate :             learning rate used by the optimizer
                rnn_number_steps :          optional : number of time steps when the model is used as an rnn
        '''
        # data and target are placeholders
        self.data = data
        self.target = target
        # define hyperparameters of network
        self.n_input = int(self.data.get_shape()[1])
        self.n_hidden = number_hidden_nodes
        self.n_output = int(self.target.get_shape()[1])
        self.learningRate = learning_rate
        self.rnn_number_steps = rnn_number_steps
        debugInfo(__name__,"#input : %d   #hidden : %d   #output : %d   learningRate : %.2f"%(self.n_input,self.n_hidden,self.n_output,self.learningRate))
        # reference operation attributes of model
        self.addAttributes()
Example #21
    def fill_time_gaps(self, orig_df, time_format='%Y-%m-%d %H:%M:%S.%f'):
        '''
			Public method for filling time gaps
			Params
				self.orig_df : panda data frame containing wide data (one row per time stamp, one column per sensor)
				self.time_format : format that time stamp is to be parsed with. some differences exist (no millisecond etc)
			Returns
				[self.new_df, self.new_time_stamps]
		'''
        self.time_format = time_format
        self.orig_df = orig_df

        self.convert_orig_time_stamps_as_datetime_objects()

        if self.count_gaps(self.orig_time_stamps) == 0:
            debugInfo(__name__, "No time gaps found")
            self.convert_datetime_objects_to_orig_time()
            return self.orig_df

        debugInfo(__name__, "Time gaps found, beginning to fill")

        return self.fill_gaps()
Example #22
def setupNet(config):
    '''
        Creates the operation graph in tensorflow
        returns all components necessary for training/testing
    '''
    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):

        # batch size, number of input sequences, size of input sequence
        pl_input = tf.placeholder(tf.float32,
                                  shape=[
                                      None,
                                      len(config.rnn_input_time_sequence),
                                      config.number_input_neurons
                                  ],
                                  name="input_placeholder")

        # batch size, number of target time points (equal to the number of input time points), number of target neurons per time point
        pl_output = tf.placeholder(tf.float32,
                                   shape=[
                                       None,
                                       len(config.rnn_target_time_sequence),
                                       config.number_target_neurons
                                   ],
                                   name="target_placeholder")

        # create neural network and define in graph
        debugInfo(__name__, "Creating neural network")
        nn = lstm(data=pl_input,
                  target=pl_output,
                  number_hidden_nodes=config.n_hidden,
                  learning_rate=config.learningRate,
                  rnn_number_steps=len(config.rnn_input_time_sequence))

        saver = tf.train.Saver()
        summary_op = tf.merge_all_summaries()

        return pl_input, pl_output, nn, saver, graph, summary_op
Example #23
def prepareData(data_wide,
                indexOutputSensor,
                timeOffsets,
                inputFunction,
                adjacency=None,
                sequential=[0]):
    '''
        Creates a dataframe containing desired input/output within the same table
        Args:
            data_wide : numpy array of all data (eg pivoted and smooth sqlToNumpy output)
            indexOutputSensor : the sensor to be predicted
            timeOffsets : python list of desired output times
            inputFunction : pd_ function (1 of 8) that formats input data in the desired manner
            adjacency : optional : a single numpy vector
            sequential : optional : python list (like timeOffsets) specifying which time points as input
    '''
    # input data is moved vertically down by max of timeOffsets
    max_output = max(timeOffsets)
    max_sequential = max(sequential)

    i = inputFunction(data_wide,
                      indexOutputSensor,
                      s=sequential,
                      a=adjacency,
                      max_output=max_output,
                      max_sequential=max_sequential)
    debugInfo(__name__,
              "Preparing data : %d inputs %d" % (i.shape[1], i.shape[0]))
    # create 'output' data :
    o = timeOffsetData(data_wide[:, indexOutputSensor],
                       timeOffsets,
                       b=max(sequential))
    debugInfo(__name__,
              "Preparing data : %d outputs %d" % (o.shape[1], o.shape[0]))

    # combine input/output in one dataframe
    df = pd.DataFrame(np.hstack((i, o)))
    return df, i.shape[1]
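
The offset idea behind timeOffsetData (whose implementation is not shown in these examples) is simply to pair each input row with the value of the output sensor some minutes later. A simplified stand-in illustrating that pairing:

import numpy as np
import pandas as pd

series = np.arange(10)                    # one sensor, 10 time points
offset = 3                                # predict 3 minutes ahead
inputs = series[:-offset]                 # values at t
targets = series[offset:]                 # values at t + 3
print(pd.DataFrame({'input_t': inputs, 'target_t+3': targets}))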
Example #24
def make_average_week(self, filename, sql_headers = ['S_IDX','ZEIT','wert'], time_format = '%Y-%m-%d %H:%M:%S'):
	'''
		filename :		File path to csv file (sql output with 3 columns)
	'''

	self.time_format = time_format
	self.sql_headers = sql_headers
	debugInfo(__name__,"making average week : preparing data")
	

	# NOTE: this assignment is incomplete in the source snippet; presumably the csv at
	# 'filename' is read and pivoted into a wide DataFrame here (eg via pivot_simple)
	df_pd =

	df = df_pd.values
	
	length_week = 7*1440
	num_sensors = df.shape[1]
	num_weeks = df.shape[0]/length_week

	df_avg = np.zeros([length_week,num_sensors])

	debugInfo(__name__,"Data successfully prepared, finding average of %d weeks"%num_weeks)

	for time_in_week in range(0,length_week):
	    # get indices of all rows corresponding to a certain time of the day/week 
	    idxs = [(length_week*week_idx)+time_in_week for week_idx in range(0,int(num_weeks))]
	    # (eg monday 00:02) is equal to the average of every monday at 00:02
	    df_avg[time_in_week] = np.nanmean(df[idxs],0)

	df_avg_pd = pd.DataFrame(df_avg)

	# day of week as integer with sunday being 1
	# new_row_names is assumed to be defined elsewhere (the datetime stamps matching the rows of df_pd)
	avg_row_names = [datetime.datetime.strftime(i, '%w_%H:%M:%S') for i in new_row_names[0:df_avg.shape[0]]]
	
	df_avg_pd.index = avg_row_names
	df_avg_pd.columns = df_pd.columns.values
	
	return df_avg_pd
Example #25
    def check_if_has_gaps(self):

        num_gaps = self.count_gaps(self.new_time_stamps)
        if num_gaps == 0:
            debugInfo(__name__, "Time gaps successfully filled")
        else:
            raise Exception("%d Gaps found!!!" % num_gaps)
Example #26
def normalizeData(data_df):
    max_value = np.nanmax(data_df.values)
    debugInfo(__name__, "Max value in maxMinNormalization is %.2f" % max_value)
    return ((data_df / max_value) * .99999999) + 0.00000001, max_value
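
normalizeData scales every value by the global max and applies a tiny affine offset so the result lies in (0, 1]. The dsh.denormalizeData calls used in testNetwork are assumed to invert this; a matching inverse would look like the sketch below (the real implementation is not shown in these examples):

def denormalizeData_sketch(normalized, max_value):
    # undo the small affine offset, then rescale by the stored max value
    return ((normalized - 0.00000001) / .99999999) * max_value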
Example #27
def main(args):

    config = Configuration(args)

    methods = [
        {
            'func': pd_1s_singleInput,
            'name': 'ffnn_simple',
            'adj': False
        },
    ]
    #{'func':pd_2s_allInput,'name':'ffnn_all','adj':False},]
    #{'func':pd_3s_adjacency_withSelf,'name':'ffnn_nn+','adj':True},
    #{'func':pd_4s_adj_noSelf,'name':'ffnn_nn','adj':True}]

    # create training data (all of july)
    data_df, max_value, specifiedSensors = formatFromSQL(
        path_sqlFile=config.path_sqlFile, sql_headers=config.sql_headers)

    # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
    remove = [182, 183, 184, 185, 281]
    removeInx = []
    for i in remove:
        index = np.where(data_df.columns.values == i)[0]
        if len(index) > 0: removeInx.append(index[0])
    data_df.drop(data_df.columns[removeInx], axis=1, inplace=True)
    specifiedSensors = pd.DataFrame(data_df.columns.values)

    # create test data (one or more)
    config.test_dicts = []
    for path_test in config.path_sqlTestFile:
        test_df, test_max_value, _ = formatFromSQL(
            path_sqlFile=path_test,
            specifiedSensorsArray=specifiedSensors,
            sql_headers=config.sql_headers)
        config.test_dicts.append({
            'df': test_df,
            'max': test_max_value,
            'name': os.path.basename(os.path.normpath(path_test)).replace('.csv', '')
        })

    # create a list that contains the (function) index of the minimum MAE (averaged)
    path_idxsMinMae = os.path.join(config.path_outputDir, "indicesMinMaes.csv")
    idxMinMae_list = []

    ## FOR EACH SENSOR ##
    #for indexOutputSensor in range(0,data_df.shape[1]):
    for indexOutputSensor in range(0, 1):

        # create folder for current sensor
        debugInfo(__name__,
                  "SENSOR %d" % data_df.columns.values[indexOutputSensor])
        dir_sensor = os.path.join(
            config.path_outputDir,
            "s_%d" % data_df.columns.values[indexOutputSensor])
        dir_sensor_tf = os.path.join(dir_sensor, 'tf')

        if not os.path.exists(dir_sensor): os.makedirs(dir_sensor)
        if not os.path.exists(dir_sensor_tf): os.makedirs(dir_sensor_tf)

        # set up empty data frame and array for the average MAE of all testing data. first column is names of all the functions
        testSetInfo = []

        # create necessary paths and empty arrays for each training set
        for i in range(0, len(config.test_dicts)):
            avgMaes_df = pd.DataFrame(
                np.zeros((len(methods), len(config.rnn_target_time_sequence))))
            avgMaes_array = np.zeros(
                (len(methods), len(config.rnn_target_time_sequence)))
            path_avgMaes_df = os.path.join(
                dir_sensor, "avgMaesForSensor_%d.csv" %
                data_df.columns.values[indexOutputSensor])

            # create folders for current (test) data
            dir_test = os.path.join(dir_sensor, config.test_dicts[i]['name'])
            dir_test_tf = os.path.join(dir_test, 'tf')
            dir_test_results = os.path.join(dir_test, 'output')
            if not os.path.exists(dir_test): os.makedirs(dir_test)
            if not os.path.exists(dir_test_tf): os.makedirs(dir_test_tf)
            if not os.path.exists(dir_test_results):
                os.makedirs(dir_test_results)

            testSetInfo.append({
                'avgMaes_df': avgMaes_df,
                'avgMaes_array': avgMaes_array,
                'path_avgMaes_df': path_avgMaes_df,
                'dir_test': dir_test,
                'test_dr_tf': dir_test_tf,
                'dir_test_results': dir_test_results
            })

        ## FOR EACH DATA PREPARATION METHOD ##
        # this affects how the network is formed! changes every time
        for j in range(0, len(methods)):

            debugInfo(__name__,
                      "Using %s to prepare data" % methods[j]['name'])
            config.path_savedSession = os.path.join(
                dir_sensor_tf, "tfsession_%s.ckpt" % methods[j]['name'])

            debugInfo(__name__, "Creating Data for Training")
            config.data = makeDataSetObject(
                data_df=data_df,
                max_value=max_value,
                outputSensorIndex=indexOutputSensor,
                prepareData_function=methods[j]['func'],
                path_adjacencyMatrix=None if
                (methods[j]['adj'] == False) else config.path_adjacencyMatrix,
                config=config)
            # the number of data points used as input (per time point). eg all sensors, 1 sensor, only nearest neighbors etc
            config.number_input_neurons = config.data.train.inputData.shape[2]
            config.number_target_neurons = 1
            # train using training set
            trainNetwork(config)

            ## FOR EACH TEST DATA SET ##
            for i in range(0, len(config.test_dicts)):

                testData = config.test_dicts[i]

                debugInfo(__name__, "Creating Data for Testing")
                config.path_outputFile = os.path.join(
                    testSetInfo[i]['dir_test_results'],
                    "%s.csv" % methods[j]['name'])
                config.data = makeDataSetObject(
                    data_df=testData['df'],
                    max_value=testData['max'],
                    outputSensorIndex=indexOutputSensor,
                    prepareData_function=methods[j]['func'],
                    path_adjacencyMatrix=None if (methods[j]['adj'] == False)
                    else config.path_adjacencyMatrix,
                    config=config)
                maes = testNetwork(config)

                testSetInfo[i]['avgMaes_df'].iloc[j, :] = maes

        # contains average maes for all test data sets (average of averages)
        avgMaeOverTests_df = pd.DataFrame(
            np.zeros((len(methods), len(config.rnn_target_time_sequence))))
        avgMaeOverTests_array = np.zeros(
            (len(methods), len(config.rnn_target_time_sequence)))
        path_avgMaeOverTests = os.path.join(
            dir_sensor, "avgMaesForSensor_%d.csv" %
            data_df.columns.values[indexOutputSensor])

        # iterate over all 'avgMae' tables (for each test data set)
        for i in range(0, len(config.test_dicts)):
            testSetInfo[i]['avgMaes_df'].index = np.array([
                (lambda x: x['name'])(funcDic) for funcDic in methods
            ])
            testSetInfo[i]['avgMaes_df'].to_csv(
                testSetInfo[i]['path_avgMaes_df'],
                header=([(lambda x: "t_%d" % x)(to)
                         for to in config.rnn_target_time_sequence]))
            avgMaeOverTests_array = testSetInfo[i][
                'avgMaes_df'].values + avgMaeOverTests_array

        # get average of maes for all test data
        avgMaeOverTests_array = avgMaeOverTests_array / len(config.test_dicts)
        avgMaeOverTests_df.iloc[:, :] = avgMaeOverTests_array
        avgMaeOverTests_df.index = [(lambda x: x['name'])(funcDic)
                                    for funcDic in methods]
        avgMaeOverTests_df.to_csv(path_avgMaeOverTests,
                                  header=([
                                      (lambda x: "t_%d" % x)(to)
                                      for to in config.rnn_target_time_sequence
                                  ]))

        # get index of function with lowest MAE and save
        idxMinMae_list.append(avgMaeOverTests_array.argmin(axis=0))

    idxMinMae_df = pd.DataFrame(
        np.hstack((specifiedSensors.values[0:2], np.array(idxMinMae_list))))
    idxMinMae_df.to_csv(path_idxsMinMae,
                        header=(['sensor'] +
                                [(lambda x: "t_%d" % x)(to)
                                 for to in config.rnn_target_time_sequence]))
Example #28
def makeDataSetObject(data_df,
                      max_value,
                      prepareData_function,
                      path_adjacencyMatrix,
                      outputSensorIndex,
                      config=None):
    '''
        Args : 
            data_df :                Pandas dataframe of normalized wide data (one column per sensor)
            max_value :              Max value used during normalization (stored on the returned object)
            prepareData_function :   pd_ function that formats the input data in the desired manner
            path_adjacencyMatrix :   Optional path to an adjacency matrix csv (None to skip)
            outputSensorIndex :      Column index of the sensor to be predicted
            config :                 Configuration object providing the rnn time sequence settings

        Return :
            theData :       FullDataSet object from dataset_helpers containing two DataSet 
                            objects containing two numpy arrays(input/target), contains next_batch() function!
    '''

    # define index of single output sensor (the output is at some time in the future)
    adjacencyForOutputSensor = None
    # add in the adjacency matrix

    if (path_adjacencyMatrix is not None):
        debugInfo(__name__, "Found an adjacency matrix : multiplying it in!")
        # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
        #data_df=pd.DataFrame(data_df.iloc[:,5:data_df.shape[1]].values,columns=data_df.columns.values[5:data_df.shape[1]])

        # list of sensors columns that we are using
        desired = data_df.columns.values

        # read adjacency matrix
        adjMatrix_orig = pd.read_csv(path_adjacencyMatrix)

        # adjacency matrix csv has headers as type string, with columns 0,1 actual strings : rename all columns as ints!
        sensorsList = list(
            adjMatrix_orig.columns.values[2:adjMatrix_orig.shape[1]].astype(
                np.int64))
        columns = [0, 1] + sensorsList
        adjMatrix_orig.columns = columns

        # remove all columns (sensors) that we don't want, leaving only sensors that are desired
        # this uses header names to reference the columns that i want
        removed = adjMatrix_orig[desired]

        # get row index of single sensor being used for output (as a string) : this row is the adjacency!
        indexForSensorInMatrix = np.where(adjMatrix_orig.iloc[:, 1] == data_df.
                                          columns.values[outputSensorIndex])[0]
        adjacencyForOutputSensor = removed.iloc[
            indexForSensorInMatrix, :].values
        print(data_df.columns.values[np.where(
            adjacencyForOutputSensor[0] == 1)[0]])

    # create input and output vectors
    input_, output_, indexOutputBegin = prepareData(
        data_df.values,
        indexOutputSensor=outputSensorIndex,
        inputFunction=prepareData_function,
        config=config,
        adjacency=adjacencyForOutputSensor)

    debugInfo(__name__, "Making FullDataSet object containing train/test data")
    # create FullDataSet object with appropriate data
    theData = dsh.FullDataSet(trainInput=input_, trainOutput=output_)
    theData.max_value = max_value
    theData.train.rowNames = data_df.index[:-(
        max(config.rnn_target_time_sequence) - 1)]
    return theData
Example #29
def main(args):

    config = Configuration(args)

    methods = [
        {
            'func': pd_1_singleInput,
            'name': 'f1_singleInput',
            'adj': False
        },
    ]
    #{'func':pd_2_allInput,'name':'f2_allInput','adj':False},]
    #{'func':pd_3_adjacency_withSelf,'name':'pd_3_adjacency_withSelf','adj':True},
    #{'func':pd_4_adj_noSelf,'name':'pd_4_adj_noSelf','adj':True},
    #{'func':pd_1s_singleInput,'name':'f1s_singleInput','adj':False},
    #{'func':pd_2s_allInput,'name':'f2s_allInput','adj':False},
    #{'func':pd_3s_adjacency_withSelf,'name':'pd_3s_adjacency_withSelf','adj':True},
    #{'func':pd_4s_adj_noSelf,'name':'pd_4s_adj_noSelf','adj':True}]

    # create training data (all of july)
    data_df, max_value, specifiedSensors = formatFromSQL(
        path_sqlFile=config.path_sqlFile)

    # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
    remove = [182, 183, 184, 185, 281]
    removeInx = []
    for i in remove:
        index = np.where(data_df.columns.values == i)[0]
        if len(index) > 0: removeInx.append(index[0])
    data_df.drop(data_df.columns[removeInx], axis=1, inplace=True)
    specifiedSensors = pd.DataFrame(data_df.columns.values)

    # create test data (one or more)
    config.test_dicts = []
    for path_test in config.path_sqlTestFile:
        test_df, test_max_value, _ = formatFromSQL(
            path_sqlFile=path_test, specifiedSensorsArray=specifiedSensors)
        config.test_dicts.append({
            'df': test_df,
            'max': test_max_value,
            'name': os.path.basename(os.path.normpath(path_test)).replace('.csv', '')
        })

    # create a list that contains the (function) index of the minimum MAE (averaged)
    path_idxsMinMae = os.path.join(config.path_outputDir, "indicesMinMaes.csv")
    idxMinMae_list = []

    #for indexOutputSensor in range(0,1):
    for indexOutputSensor in range(0, data_df.shape[1]):

        # create folder for current sensor
        debugInfo(__name__,
                  "SENSOR %d" % data_df.columns.values[indexOutputSensor])
        currentDir = os.path.join(
            config.path_outputDir,
            "s_%d" % data_df.columns.values[indexOutputSensor])
        if not os.path.exists(currentDir): os.makedirs(currentDir)

        # set up empty data frame and array for the average MAE of all testing data. first column is names of all the functions
        avgMaes_df = pd.DataFrame(
            np.zeros((len(methods), len(config.timeOffsets))))
        avgMaes_array = np.zeros((len(methods), len(config.timeOffsets)))
        path_avgMaes_df = os.path.join(
            currentDir, "avgMaesForSensor_%d.csv" %
            data_df.columns.values[indexOutputSensor])

        for i in range(0, len(config.test_dicts)):

            testData = config.test_dicts[i]
            # create folders for current data frame
            current_df_dir = os.path.join(currentDir, testData['name'])
            currentDir_tf = os.path.join(current_df_dir, 'tf')
            currentDir_rslts = os.path.join(current_df_dir, 'output')

            if not os.path.exists(current_df_dir): os.makedirs(current_df_dir)
            if not os.path.exists(currentDir_tf): os.makedirs(currentDir_tf)
            if not os.path.exists(currentDir_rslts):
                os.makedirs(currentDir_rslts)

            path_allMaesForSensor = os.path.join(
                current_df_dir, "allMaesForSensor_%d.csv" %
                data_df.columns.values[indexOutputSensor])
            all_maesForSensor = pd.DataFrame(np.zeros(avgMaes_df.shape))

            # iterate over each method and train a new network
            for j in range(0, len(methods)):
                debugInfo(__name__,
                          "Using %s to prepare data" % methods[j]['name'])
                config.path_savedSession = os.path.join(
                    currentDir_tf, "tfsession_%s" % methods[j]['name'])
                config.path_outputFile = os.path.join(
                    currentDir_rslts,
                    "predictions_%s.csv" % methods[j]['name'])

                #first train the network using all data points for july
                # if a non sequential input then sequential should be none
                config.sequential = [0] if (i < 4) else list(range(0, 5))

                debugInfo(__name__, "Creating Data for Training")

                config.data = makeDataSetObject(
                    data_df=data_df,
                    max_value=max_value,
                    timeOffsets=config.timeOffsets,
                    outputSensorIndex=indexOutputSensor,
                    sequential=config.sequential,
                    splitTrain=False,
                    path_adjacencyMatrix=None if (methods[j]['adj'] == False)
                    else config.path_adjacencyMatrix,
                    prepareData_function=methods[j]['func'])

                # train all data using n
                trainNetwork(config)

                #then test the network (after all training) using all data points from august

                debugInfo(__name__, "Creating Data for Testing")

                config.data = makeDataSetObject(
                    data_df=testData['df'],
                    max_value=testData['max'],
                    timeOffsets=config.timeOffsets,
                    outputSensorIndex=indexOutputSensor,
                    sequential=config.sequential,
                    splitTrain=False,
                    path_adjacencyMatrix=None if (methods[j]['adj'] == False)
                    else config.path_adjacencyMatrix,
                    prepareData_function=methods[j]['func'])

                maes = testNetwork(config)

                all_maesForSensor.iloc[j, :] = maes

            all_maesForSensor.index = np.array([(lambda x: x['name'])(funcDic)
                                                for funcDic in methods])
            all_maesForSensor.to_csv(path_allMaesForSensor,
                                     header=([(lambda x: "t_%d" % x)(to)
                                              for to in config.timeOffsets]))

            avgMaes_array = all_maesForSensor.values + avgMaes_array

        # get average of maes for all test data
        avgMaes_array = avgMaes_array / len(config.test_dicts)
        avgMaes_df.iloc[:, :] = avgMaes_array
        avgMaes_df.index = [(lambda x: x['name'])(funcDic)
                            for funcDic in methods]
        avgMaes_df.to_csv(path_avgMaes_df,
                          header=([(lambda x: "t_%d" % x)(to)
                                   for to in config.timeOffsets]))
        # get index of function with lowest MAE and save
        idxMinMae_list.append(avgMaes_array.argmin(axis=0))
    idxMinMae_df = pd.DataFrame(
        np.hstack((specifiedSensors.values[0:2], np.array(idxMinMae_list))))
    idxMinMae_df.to_csv(path_idxsMinMae,
                        header=(['sensor'] + [(lambda x: "t_%d" % x)(to)
                                              for to in config.timeOffsets]))
Example #30
def makeDataSetObject(data_df,
                      max_value,
                      prepareData_function,
                      outputSensorIndex,
                      sequential=None,
                      timeOffsets=None,
                      splitTrain=True,
                      trainTestFraction=.8,
                      path_adjacencyMatrix=None,
                      path_preparedData=None):
    '''
        Args : 
            data_df :                Pandas dataframe of normalized wide data (one column per sensor)
            max_value :              Max value used during normalization (stored on the returned object)
            prepareData_function :   pd_ function that formats the input data in the desired manner
            outputSensorIndex :      Column index of the sensor to be predicted
            sequential :             Optional python list of time points used as sequential input
            timeOffsets :            Python list of desired output time offsets (minutes into the future)
            splitTrain :             Boolean : if True the data is split into train and test sets
            trainTestFraction :      Fraction of the data reserved for training (default .8)
            path_adjacencyMatrix :   Optional path to an adjacency matrix csv (None to skip)
            path_preparedData :      Optional path : if given, the prepared data is saved there as csv

        Return :
            theData :       FullDataSet object from dataset_helpers containing two DataSet 
                            objects containing two numpy arrays(input/target), contains next_batch() function!
    '''

    # define index of single output sensor (the output is at some time in the future)
    adjacencyForOutputSensor = None
    # add in the adjacency matrix
    if (path_adjacencyMatrix is not None):
        debugInfo(__name__, "Found an adjacency matrix : multiplying it in!")
        # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
        #data_df=pd.DataFrame(data_df.iloc[:,5:data_df.shape[1]].values,columns=data_df.columns.values[5:data_df.shape[1]])

        # list of sensors columns that we are using
        desired = data_df.columns.values

        # read adjacency matrix
        adjMatrix_orig = pd.read_csv(path_adjacencyMatrix)

        # adjacency matrix csv has headers as type string, with columns 0,1 actual strings : rename all columns as ints!
        sensorsList = list(
            adjMatrix_orig.columns.values[2:adjMatrix_orig.shape[1]].astype(
                np.int64))
        columns = [0, 1] + sensorsList
        adjMatrix_orig.columns = columns

        # remove all columns (sensors) that we don't want, leaving only sensors that are desired
        # this uses header names to reference the columns that i want
        removed = adjMatrix_orig[desired]

        # get row index of single sensor being used for output (as a string) : this row is the adjacency!
        indexForSensorInMatrix = np.where(adjMatrix_orig.iloc[:, 1] == data_df.
                                          columns.values[outputSensorIndex])[0]
        adjacencyForOutputSensor = removed.iloc[
            indexForSensorInMatrix, :].values
        print(data_df.columns.values[np.where(
            adjacencyForOutputSensor[0] == 1)[0]])
    data_prepared, indexOutputBegin = prepareData(
        data_df.values,
        outputSensorIndex,
        timeOffsets,
        prepareData_function,
        adjacency=adjacencyForOutputSensor,
        sequential=sequential)

    print(data_prepared.shape)
    #rowNames = range(0,max(timeOffsets))+list(data_df.index) + range(0,max(sequential))
    #data_prepared.index = rowNames

    data_final_naDropped = data_prepared.dropna()

    debugInfo(
        __name__, "From %d total timepoints, %d are being used (%.2f)" %
        (data_prepared.shape[0], data_final_naDropped.shape[0],
         (data_final_naDropped.shape[0] / data_prepared.shape[0])))

    #data_final.to_csv("/Users/ahartens/Desktop/Temporary/24_10_16_wideTimeSeriesBelegung.csv")
    if (path_preparedData is not None):
        debugInfo(__name__,
                  "Saving processed file to %s" % (path_preparedData))
        data_final_naDropped.to_csv(path_preparedData, index=False)

    if (splitTrain == True):
        train_df, test_df = dsh.splitDataToTrainAndTest(
            data_final_naDropped, trainTestFraction)
        debugInfo(
            __name__, "train_df (%d,%d)\ttest_df (%d,%d)" %
            (train_df.shape[0], train_df.shape[1], test_df.shape[0],
             test_df.shape[1]))
        debugInfo(
            __name__, "Single output sensor at index %d, sensor name : %s" %
            (outputSensorIndex, data_df.columns.values[outputSensorIndex]))

        train_input = train_df.iloc[:, 0:indexOutputBegin]
        train_output = train_df.iloc[:, indexOutputBegin:data_final_naDropped.
                                     shape[1]]

        test_input = test_df.iloc[:, 0:indexOutputBegin]
        test_output = test_df.iloc[:, indexOutputBegin:data_final_naDropped.
                                   shape[1]]

        debugInfo(__name__,
                  "Making FullDataSet object containing train/test data")
        # create FullDataSet object with appropriate data
        theData = dsh.FullDataSet(trainInput=train_input.values,
                                  trainOutput=train_output.values,
                                  testInput=test_input.values,
                                  testOutput=test_output.values)
    # Don't split data into train/test (only for testing)
    else:
        test_input = data_final_naDropped.iloc[:, 0:indexOutputBegin]
        test_output = data_final_naDropped.iloc[:, indexOutputBegin:
                                                data_final_naDropped.shape[1]]
        debugInfo(__name__, "Making FullDataSet object with only test data")
        # create FullDataSet object with appropriate data
        theData = dsh.FullDataSet(trainInput=np.empty(test_input.shape),
                                  trainOutput=np.empty(test_output.shape),
                                  testInput=test_input.values,
                                  testOutput=test_output.values)
        theData.test.rowNames = data_final_naDropped.index
    theData.max_value = max_value
    theData.toString()

    return theData