def extract_features_training(self):
        '''
        This function creates the feature dataset using the 'load_data' function from 'Data.Read_Data'.
        It operates line by line, so it should be able to handle an unlimited number of lines in the data files.
        '''
        p0 = time.time()

        ### Calculate features one id at a time
        for dataset in self.datasets:
            load_file_path = Data_Path + '/' + dataset + '.dat'
            save_file_path = self.save_path + '/' + dataset + '_features.dat'

            #Delete existing
            print('save path: ' + save_file_path)
            if os.path.exists(save_file_path):
                os.remove(save_file_path)

            with open(save_file_path, 'wb') as save_file:

                ##Create the header
                header = ['id', 'label']
                for feature in self.features:
                    header += [
                        variable + '_' + feature for variable in self.variables
                    ]
                self.header = header

                pickle.dump(header, save_file)

                ##Calculate data
                for id, label, data in load_data(filename=load_file_path,
                                                 max_row=-1):

                    #Extract subset
                    data = np.array(data[self.variables])

                    #Create data_point
                    data_point = [id, label]
                    for feature in self.features:
                        feature_func = getattr(self, self.FEATURES[feature])
                        data_point += list(feature_func(ys=data))

                    #save data_point
                    pickle.dump(data_point, save_file)
                    #save_file.write(str(data_point[1:-1]))

                    if (id % 5000) == 0:
                        print('Dataset: ' + dataset + ', Line: ' + str(id) +
                              ' out of ' + str(self.n_lines[dataset]) +
                              ', Time: ' + str(int(time.time() - p0)) + 's')
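
Because pickle.dump is called once per record, each '_features.dat' file written above is a stream of pickles: the header list first, then one [id, label, feature, ...] list per id. Below is a minimal reading sketch under that assumption; the helper name and the example path are placeholders, not part of the original code.

import pickle


def iter_feature_records(path):
    #Placeholder helper: reads a '_features.dat' stream written by extract_features_training
    with open(path, 'rb') as f:
        header = pickle.load(f)          #['id', 'label', '<variable>_<feature>', ...]
        while True:
            try:
                record = pickle.load(f)  #[id, label, feature values...]
            except EOFError:
                return
            yield dict(zip(header, record))


#Usage (placeholder path):
#for row in iter_feature_records(save_path + '/<dataset>_features.dat'):
#    print(row['id'], row['label'])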
Example #2
  def scale_norm_and_pca_test(self):
    '''
    This function creates the scaled, normalized, and PCA-transformed test datasets using the 'load_data' function from 'Data.Read_Data'.
    It operates line by line, so it should be able to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()
    
    ### Calculate features one id at a time
    for dataset in self.datasets:
      load_file_path = Data_Path + '/' + dataset + '.dat'
      save_file_path = self.save_path + '/' + dataset + '_pca.dat'
      save_file_path_no_pca = self.save_path + '/' + dataset + '_no_pca.dat'
      
      #Delete existing
      print('save path: ' + save_file_path)
      if os.path.exists(save_file_path):
          os.remove(save_file_path)

      #Delete existing
      print('save path: ' + save_file_path_no_pca)
      if os.path.exists(save_file_path_no_pca):
          os.remove(save_file_path_no_pca)          
      
      with open(save_file_path,'wb') as save_file, open(save_file_path_no_pca,'wb') as save_file_no_pca:
        
        ##Calculate data
        for id, data in load_data(filename = load_file_path, max_row = -1):
            
            #Scale
            scaled_data = scale_df(data)
            normalized_data = self._normalize(scaled_data)
            pca_data = self._pca(normalized_data)
             
            #save data_point
            pickle.dump((id,normalized_data),save_file_no_pca)
            pickle.dump((id,pca_data),save_file)
            
            if (id % 5000) == 0:
              print('Dataset: ' + dataset + ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) + ', Time: ' + str(int(time.time()-p0)) + 's')
Example #3
  def handle_NA_training(self):
    '''
    This function creates the NA-handled training dataset using the 'load_data' function from 'Data.Read_Data'.
    It operates line by line, so it should be able to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()

    ### Calculate features one id at a time
    for dataset in self.datasets:
      load_file_path = Data_Path + '/' + dataset + '.dat'
      save_file_path = self.save_path + '/' + dataset + '_NA.dat'

      #Delete existing
      print('save path: ' + save_file_path)
      if os.path.exists(save_file_path):
          os.remove(save_file_path)


      with open(save_file_path,'wb') as save_file:

        ##Calculate data
        for id, label, data in load_data(filename = load_file_path, max_row = -1):
            
            #There are 3 types of NA:
            #1) There is no satellite data (all rows are NA)
            #2) The SHARP mask is empty (some NAs, the rest are zeros except XR_MAX)
            #3) The R mask is empty (R is zero)

            #For later use
            non_division_vars = data.columns.difference(['XR_MAX'] + self.division_variables)
            non_XR = data.columns.difference(['XR_MAX'])
            
            ##Check if satellite data is missing (everything but XR_MAX has NAs)
            NA_satellite_index = data.index[data[non_XR].isnull().all(axis=1)]
            data['NA_satellite'] = 0
            data.loc[NA_satellite_index,'NA_satellite'] = 1
            '''
            if len(NA_satellite_index) > 0:
                self.save_set = data
                print('missing satellite')
                return
            '''
            ##Check if there is no SHARP mask (self.division_variables are NaN, the rest are zero)
            indices = ((data[non_division_vars] == 0).all(axis = 1) & data[self.division_variables].isna().all(axis = 1))
            
            data['NA_SHARPmask'] = 0
            data.loc[indices,'NA_SHARPmask'] = 1
            
            '''
            if len(indices) > 0:
                self.save_set = data
                print('missing sharp')
                return
            '''    
            ##Check if there is no R mask
            data['NA_Rmask'] = 0
            data.loc[data['R_VALUE'] == 0,'NA_Rmask'] = 1
            
            ##Check if there is no XR data
            data['NA_XR_MAX'] = 0
            data.loc[data['XR_MAX'] == -99999,'NA_XR_MAX'] = 1

            ### Find rows with NAs that still have both satellite data and a SHARP mask (I think very few)
            NA_indices = data.isna().any(axis = 'columns')
            NA_butsharp = (NA_indices & (data['NA_SHARPmask'] == 0) & (data['NA_satellite'] == 0))
            #if sum(NA_butsharp) > 0:
            #    print('id: ' + str(id) + ' has NA')
            
            ##Replace -99999 with NA in XR_MAX
            data.loc[data['XR_MAX'] == -99999,'XR_MAX'] = float('NaN')
                   
            data = self._NA_linear_interpolate(data) #Replaces full NA-rows with 0
            
            #save data_point
            pickle.dump((id,label,data),save_file)
            #save_file.write(str(data_point[1:-1]))
            
            if (id % 5000) == 0:
              print('Dataset: ' + dataset + ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) + ', Time: ' + str(int(time.time()-p0)) + 's')
Example #4

import numpy as np


def scale_df(df):
    #Apply a signed power transform to selected columns
    #(the function signature and this loop over df.columns are inferred from the __main__ usage below)
    for col in df.columns:
        if col in [
                "TOTUSJH", "TOTBSQ", "TOTPOT", "TOTUSJZ", "ABSNJZH", "SAVNCPP",
                "USFLUX", "XR_MAX"
        ]:
            df[col] = scale_series(df[col],
                                   lambda x: np.sign(x) * np.abs(x)**0.2)
        if col in ["MEANPOT"]:
            df[col] = scale_series(df[col],
                                   lambda x: np.sign(x) * np.abs(x)**0.1)
        if col in ["TOTFZ", "TOTFY", "TOTFX"]:
            df[col] = scale_series(df[col],
                                   lambda x: np.sign(x) * np.abs(x)**0.5)

    return df


def scale_series(series, f):
    return np.fromiter((f(x) for x in series), series.dtype)


# [sign(x) * abs(x)^(1/5) for x in  ["TOTUSJH", "TOTBSQ", "TOTPOT", "TOTUSJZ", "ABSNJZH", "SAVNCPP", "USFLUX", "XR_MAX"]]
# [sign(x) * abs(x)^(1/10) for x in  ["MEANPOT"]]
# [sign(x) * abs(x)^(1/2) for x in  ["TOTFZ", "TOTFY", "TOTFX"]]

if __name__ == '__main__':
    from _0_DataCreation.Read_Data import load_data, fn
    id, label, df = next(load_data(filename=fn, max_row=1))
    df2 = scale_df(df)
    print(df)
    print((df == df2).all().all())  #element-wise check; all(df == df2) would only iterate column names
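
For intuition, the signed power transforms listed in the comments above map values symmetrically around zero; for example, the fifth-root scaling sends ±32 to roughly ±2. The values below are made up purely for illustration, not taken from the dataset.

#Illustrative check of the signed power transform (placeholder values)
import numpy as np

vals = np.array([-32.0, 0.0, 32.0])
print(np.sign(vals) * np.abs(vals)**0.2)   #approximately [-2.  0.  2.]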
Example #5
import random

import numpy as np
import matplotlib.pyplot as plt

#load_data and file_path are assumed to be imported/defined earlier in the original script
rand_id = random.randint(1, 99)

n_ones = 10
n_zeros = 10

#Take a random file and a random index (remember the ids are already shuffled, so up to 100 ids should be fine)
time_series_ones = []
time_series_zeros = []
zero_count = 0
one_count = 0
begin_id = 0
while True:
    if one_count == 10 and zero_count == 10:
        break
    for id, label, data in load_data(filename=file_path, max_row=100):
        if id > begin_id:
            begin_id = id
            if label == 1 and one_count < 10:
                one_count += 1
                time_series_ones.append(data['pca_1'])
                break
            elif label == 0 and zero_count < 10:
                zero_count += 1
                time_series_zeros.append(data['pca_1'])
                break

time_series_ones = np.array(time_series_ones)
time_series_zeros = np.array(time_series_zeros)

plt.plot(np.arange(1, time_series_zeros.shape[1] + 1) / 5,
Example #6
from typing import Generator

#load_data and Data_Path are assumed to be imported/defined at module level elsewhere in the project
def Load_Main_Dataset(Dataset: str, max_row=-1) -> Generator:
    return load_data(filename=Data_Path + '/' + Dataset + '_NA.dat', max_row=max_row)
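
A minimal usage sketch of Load_Main_Dataset, assuming load_data yields (id, label, data) tuples as in the handle_NA_training example above; the dataset name 'Train' below is a placeholder, not necessarily a real file.

#Usage sketch (the dataset name 'Train' is a placeholder)
if __name__ == '__main__':
    for id, label, data in Load_Main_Dataset('Train', max_row=5):
        #each record is one id's time series DataFrame with the NA indicator columns added above
        print(id, label, data.shape)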