Example #1
  def extract_features_test(self):
    '''
    This function creates the dataset using the 'load_data' function from 'Data.Read_Data'.
    It operates line by line, so it should be able to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()
    
    ### Calculate features one id at a time
    for dataset in self.datasets:
      load_file_path = Data_Path + '/' + dataset + '.dat'
      save_file_path = self.save_path + '/' + dataset + '_all_last.dat'
      
      #Delete existing
      print('save path: ' + save_file_path)
      if os.path.exists(save_file_path):
          os.remove(save_file_path)
          
      
      with open(save_file_path,'wb') as save_file:
        
        ##Create the header
        header = ['id']
        for feature in self.features:
          header += [variable + '_' + feature for variable in self.variables]
        self.header = header
        
        pickle.dump(header,save_file)
        
        ##Calculate data
        for id, data in load_data(filename = load_file_path, max_row = -1):
            
            #Extract subset
            data = np.array(scale_df(data)[self.variables])
              
            #Create data_point
            data_point = [id]
            for feature in self.features:
              feature_func = getattr(self, self.FEATURES[feature])
              data_point += list(feature_func(xs = data))

            #save data_point
            pickle.dump(data_point,save_file)
            #save_file.write(str(data_point[1:-1]))
            
            if (id % 5000) == 0:
              print('Dataset: ' + dataset + ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) + ', Time: ' + str(int(time.time()-p0)) + 's')
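
Because extract_features_test writes the header and then every data_point with repeated pickle.dump calls on the same file handle, the output file is a pickle stream rather than a single object. Reading it back means calling pickle.load in a loop until EOFError; a minimal sketch (the reader function and the file name in the usage comment are illustrative, not part of the original module):

import pickle

def iter_pickle_stream(path):
    # Yield objects from a file written with repeated pickle.dump calls.
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

# Hypothetical usage against a file produced by extract_features_test:
# stream = iter_pickle_stream('fold1_all_last.dat')
# header = next(stream)        # ['id', '<variable>_<feature>', ...]
# for data_point in stream:    # [id, feature values...]
#     ...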
Example #2
  def scale_norm_and_pca_test(self):
    '''
    This function creates the dataset using the 'load_data' function from 'Data.Read_Data'.
    It operates line by line, so it should be able to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()
    
    ### Calculate features one id at a time
    for dataset in self.datasets:
      load_file_path = Data_Path + '/' + dataset + '.dat'
      save_file_path = self.save_path + '/' + dataset + '_pca.dat'
      save_file_path_no_pca = self.save_path + '/' + dataset + '_no_pca.dat'
      
      #Delete existing
      print('save path: ' + save_file_path)
      if os.path.exists(save_file_path):
          os.remove(save_file_path)

      #Delete existing
      print('save path: ' + save_file_path_no_pca)
      if os.path.exists(save_file_path_no_pca):
          os.remove(save_file_path_no_pca)          
      
      with open(save_file_path,'wb') as save_file, open(save_file_path_no_pca,'wb') as save_file_no_pca:
        
        ##Calculate data
        for id, data in load_data(filename = load_file_path, max_row = -1):
            
            #Scale
            scaled_data = scale_df(data)
            normalized_data = self._normalize(scaled_data)
            pca_data = self._pca(normalized_data)
             
            #save data_point
            pickle.dump((id,normalized_data),save_file_no_pca)
            pickle.dump((id,pca_data),save_file)
            
            if (id % 5000) == 0:
              print('Dataset: ' + dataset + ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) + ', Time: ' + str(int(time.time()-p0)) + 's')
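
This example relies on self._normalize and self._pca, which are not shown. A minimal sketch of what they could wrap, assuming pre-fitted scikit-learn transformers (an assumption; the original class may implement them differently):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

class TransformSketch:
    # Hypothetical stand-in for the methods used above. Both transformers
    # would have to be fitted once (e.g. on the training fold) before the
    # streaming loop in scale_norm_and_pca_test calls them row by row.
    def __init__(self, n_components=10):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=n_components)

    def fit(self, train_matrix):
        self.pca.fit(self.scaler.fit_transform(train_matrix))
        return self

    def _normalize(self, data):
        return self.scaler.transform(np.asarray(data))

    def _pca(self, normalized_data):
        return self.pca.transform(normalized_data)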
Example #3
if __name__ == '__main__':
    from time import time

    # fn == filename

    # Test examples
    # labels = []
    # start = time()
    # for id, label, data in load_data(filename=fn2, max_row=-1):
    #     labels.append(label)
    #
    # print(time() - start)
    if True:
        for id, label, data in load_data(filename=fn, max_row=1):
            print('id={id} label={label}'.format(id=id, label=label))
            test = scale_df(data)
            print(type(data))
            print(data.columns)
            # print(print(type(data)))
            #test = data_to_keep(data, 10, list(range(6)) + list(range(11, 16)))  # range() takes no keyword arguments
            #print(test)
            # print(type(test))

    # print('\nRecursively iterate 3')
    # repeat_data = repeat_iter(load_data, kwargs=dict(filename=fn, max_row=3), n_times=-1)
    # for _ in range(10):
    #     id, label, data = next(repeat_data)
    #     print('id={id} label={label}'.format(id=id, label=label))
    #     print(data[-1, ])

    # fold1_df = load_dataframe(filename='fold1_extracted.dat')
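
All three examples consume load_data from Data.Read_Data as a generator: Examples #1 and #2 unpack (id, data) pairs, while this test block unpacks (id, label, data) triples, so the yield shape evidently depends on the file being read. A minimal sketch of a compatible generator, assuming the .dat files are pickle streams with one record per row (the real implementation may read a different format):

import pickle

def load_data_sketch(filename, max_row=-1):
    # Yield one record per row; max_row = -1 means no limit,
    # matching the max_row=-1 calls in the examples above.
    with open(filename, 'rb') as f:
        n = 0
        while max_row < 0 or n < max_row:
            try:
                yield pickle.load(f)
            except EOFError:
                break
            n += 1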