def shuffle(self):
    """
    Shuffle the rows of the feature chunk files by mixing them pairwise

    For every chunk id from 2 up to ``self.chunk_count`` a partner chunk with
    a smaller id is drawn at random. Both CSV files are loaded, their rows are
    merged and randomly permuted, and the result is split back into the two
    files: the first half (rounded down) goes to the first chunk file and the
    remaining rows go to the partner file, so every row is written exactly once.

    :return: self
    """
    shuffle_count = self.chunk_count - 1
    shuffle_counter = 0
    for iteration in range(shuffle_count):
        # chunk1 walks over ids 2..chunk_count; the partner is any earlier chunk
        chunk1_id = 2 + iteration % (self.chunk_count - 1)
        chunk2_id = random.randint(1, chunk1_id - 1)
        print(' Shuffling files {id1} <-> {id2} ..'.format(id1=chunk1_id, id2=chunk2_id))

        chunk1_path = self.features_path + str(chunk1_id) + '.csv'
        chunk2_path = self.features_path + str(chunk2_id) + '.csv'
        chunk1 = pd.read_csv(chunk1_path, sep=";", low_memory=False)
        chunk2 = pd.read_csv(chunk2_path, sep=";", low_memory=False)

        # merge two chunks and shuffle the merged rows
        # (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x)
        merged = pd.concat([chunk1, chunk2], ignore_index=True)
        merged = merged.sample(frac=1).reset_index(drop=True)

        # save shuffled chunks; positional, end-exclusive slicing keeps the two
        # halves disjoint (label-based slicing would write row `border` twice)
        border = len(merged.index) // 2
        util.write(merged.iloc[:border, :], address=chunk1_path)
        util.write(merged.iloc[border:, :], address=chunk2_path)

        shuffle_counter += 1
        print(
            ' Feature files {id1} <-> {id2} shuffled ({c} of {t})'.format(
                id1=chunk1_id, id2=chunk2_id, c=shuffle_counter, t=shuffle_count))
    return self
def save(self):
    """
    Write the extracted feature table to the feature file named in the config

    The destination is looked up in ``self.config`` under ``const.FEATURE``;
    afterwards the number of written rows is reported on stdout.

    :return:
    """
    feature_table = self.features
    destination = self.config[const.FEATURE]
    util.write(feature_table, address=destination)

    row_count = len(self.features.index)
    print(row_count, 'feature vectors are written to file')
def save_features(self):
    """
    Write the extracted feature table to ``self.features_path``

    Reports the number of written rows on stdout once the file is written.

    :return:
    """
    feature_table = self.features
    destination = self.features_path
    util.write(feature_table, address=destination)

    row_count = len(self.features.index)
    print(row_count, 'feature vectors are written to file')
def save_features(self, chunk_id=1):
    """
    Write the extracted feature table to one numbered chunk file

    The file name is ``self.features_path`` followed by the chunk id and the
    ``.csv`` extension. Reports the number of written rows on stdout.

    :param chunk_id: number appended to the path prefix (default 1)
    :return:
    """
    feature_table = self.features
    chunk_file = self.features_path + str(chunk_id) + '.csv'
    util.write(feature_table, address=chunk_file)

    row_count = len(self.features.index)
    print(row_count, 'feature vectors are written to file')
def save(self):
    """
    Store the pre-processed tables in the files named in the config

    Observed and missing-observation tables go through ``util.write``; the
    station and grid tables are written directly as ';'-separated CSV files
    without their index.

    :return: self
    """
    settings = self.config

    # observation tables
    util.write(self.obs, settings[const.OBSERVED])
    util.write(self.missing, settings[const.OBSERVED_MISSING])

    # station / grid tables share the same csv layout
    csv_layout = dict(sep=';', index=False)
    self.stations.to_csv(settings[const.STATIONS], **csv_layout)
    self.grids.to_csv(settings[const.GRIDS], **csv_layout)

    print('Data saved.')
    return self
def process(self):
    """
    Read the raw grid files, coarsen them and write one table per measure

    The forecast, live and historical grid files are read chunk by chunk (in
    that order). Every chunk is grouped by time and each group is handed to
    ``self.add`` together with the grid id map and the shared collection.
    Afterwards, per measure, the collected values are divided by their counts
    (0 where the count is not positive), sorted by time and written to the
    coarse-grid file of that measure.

    :return: self
    """
    shared_options = dict(iterator=True, low_memory=False, float_precision='round_trip')
    readers = {
        'forecast': pd.read_csv(self.config[const.GRID_FORECAST], sep=';', **shared_options),
        'live': pd.read_csv(self.config[const.GRID_LIVE], sep=';', **shared_options),
        # the history file is read with the default separator, in large chunks
        'history': pd.read_csv(self.config[const.GRID_DATA], chunksize=1500000,
                               **shared_options),
    }

    id_map = self.get_grid_id_maps()
    # id_grid is for visualizing purpose using data-frame viewer
    # id_list = [id_map[grid_id] for grid_id in sorted(id_map.keys())]
    # id_grid = np.flipud(np.array(id_list).reshape((self.row, self.column), order='F'))

    collection = {}
    for measure in self.get_measures():
        collection[measure] = dict()

    # Feed historical / live / forecast grid data into the coarsened statistics
    for category, reader in readers.items():
        for number, chunk in enumerate(reader, start=1):
            print(' merge grid chunk (%s, %d) ..' % (category, number))
            chunk.rename(columns={'stationName': const.GID, 'wind_speed/kph': const.WSPD},
                         inplace=True)

            # replace wind speed / direction by the pair wind_transform returns
            speed, direction = reform.wind_transform(
                speed=chunk[const.WSPD], direction=chunk[const.WDIR])
            chunk[const.WSPD] = speed
            chunk[const.WDIR] = direction

            # each time is a square of points
            for time, group in chunk.groupby([const.TIME]):
                self.add(time, group, id_map, collection)

    # Build and store the final table of every measure
    for measure, collect in collection.items():
        # one row per time (ascending): the time followed by value / count averages
        rows = [
            [time] + [total / count if count > 0 else 0
                      for total, count in zip(stats['values'], stats['counts'])]
            for time, stats in sorted(collect.items())
        ]
        columns = self.get_columns()
        df = pd.DataFrame(data=rows, columns=columns)
        df[const.TIME] = pd.to_datetime(df[const.TIME], utc=True)
        # group_hours = self.config[const.GROUP_HOURS]
        # run_average_df = times.running_average_df(df=df, time_key=const.TIME,
        #     value_keys=columns[1:],
        #     group_hours=group_hours, direction=1, whole_group=False)
        summary = (len(collect), self.sample_row, self.sample_column, self.city, measure)
        print('%d x (%d, %d) coarsened grid generated for (%s, %s)' % summary)
        util.write(df, self.config[const.GRID_COARSE] % measure)
    return self
def save_test(self, predicted_values):
    """
    Attach the predictions to the test table and write it to ``self.test_path``

    The predicted values are added through ``util.add_columns`` with the
    column name prefix 'f'; the number of written rows is reported on stdout.

    :param predicted_values: values passed to ``util.add_columns`` as new columns
    :return:
    """
    with_predictions = util.add_columns(
        self._test, columns=predicted_values, name_prefix='f')
    util.write(with_predictions, address=self.test_path)

    row_count = len(with_predictions.index)
    print(row_count, 'predicted tests are written to file')
def save_test(self, predicted_values):
    """
    Attach the predictions to the test table and write it to the LSTM test file

    The predicted values are added through ``util.add_columns`` with the
    column name prefix 'f'. The output path is the configured feature
    directory followed by the feature indicator, the number of time steps and
    the suffix '_lstm_tests.csv'. The number of written rows is reported on stdout.

    :param predicted_values: values passed to ``util.add_columns`` as new columns
    :return:
    """
    with_predictions = util.add_columns(
        self._test, columns=predicted_values, name_prefix='f')

    path_prefix = self.config[const.FEATURE_DIR] + self.feature_indicator
    test_path = path_prefix + str(self.time_steps) + '_lstm_tests.csv'
    util.write(with_predictions, address=test_path)

    row_count = len(with_predictions.index)
    print(row_count, 'predicted tests are written to file')