def read_file_name(self, new_file_name):
    """
    Checks whether the provided file name is valid and, if so, changes the
    reference file to look at.
    :param new_file_name: New file to be split later.
    """
    if self._file_exists(new_file_name) and self._extension_is_correct(new_file_name):
        self._file_name = new_file_name
    else:
        ppg.log_info("Registering default file_name", fmd.default_read_file)
        self._file_name = fmd.default_read_file
def read_metadata_file():
    """
    Reads the metadata file if it exists.
    :return: The metadata data frame, or None if no metadata file was found.
    """
    metadata = None
    if not os.path.isfile(META_DATA_FILE):
        ppg.log_info(
            "No metadata found. The earthquake splitting might not have been run yet."
        )
    else:
        ppg.log_info("Found metadata file")
        metadata = pd.read_csv(META_DATA_FILE)
    return metadata
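# Hedged sketch of the intended caller-side workflow: run the split only when no
# metadata file is present yet. `splitter` is a hypothetical instance of the class
# owning split_file(); it is not defined in this excerpt.
#
#   metadata = read_metadata_file()
#   if metadata is None:
#       splitter.split_file()
#       metadata = read_metadata_file()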
def update_eq_metadata(self, data):
    """
    Computes the metadata for a single earthquake and saves it in self._metadata.
    :param data: Data frame containing one full earthquake.
    """
    ppg.log_info("Computing earthquake metadata")
    # Extract the signal column once instead of re-slicing it for every statistic.
    signal = data.iloc[:, fmd.Column.DATA.value]
    row = self._nb_earthquake
    self._metadata.loc[row, 'size'] = len(data)
    self._metadata.loc[row, 'max'] = signal.max()
    self._metadata.loc[row, 'min'] = signal.min()
    self._metadata.loc[row, 'mean'] = signal.mean()
    self._metadata.loc[row, 'stdev'] = signal.std()
    # The sum of squares is stored so the global variance can be recomputed later
    # without re-reading the raw data (see update_global_metadata).
    self._metadata.loc[row, 'sum_of_sq'] = signal.pow(2).sum()
    self._nb_earthquake += 1
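# Hedged sketch of what update_eq_metadata records for one earthquake. The
# two-column frame and the position of the signal column (fmd.Column.DATA) are
# assumptions made for illustration only.
#
#   import pandas as pd
#   eq = pd.DataFrame({'signal': [3, 1, 4, 1, 5], 'time': [0.0, 0.1, 0.2, 0.3, 0.4]})
#   splitter.update_eq_metadata(eq)
#   # -> one new row in splitter._metadata:
#   #    size=5, max=5, min=1, mean=2.8, stdev ~ 1.789, sum_of_sq=52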
def split_file(self):
    """
    Goes through the file designated by self.file_name, identifies earthquakes
    and saves each one in a separate file named
    "self.output_file_name + earthquake_number + .csv".
    Extracts metadata from the original file: size, mean, standard deviation,...
    """
    chunk_size = 10**6  # Number of lines read from the raw file at once. Reduce to spare RAM.
    buffer = None  # Stores earthquakes before they are completed
    # Iteration variables
    self._nb_earthquake = 0
    first_iteration = True
    i = 0
    for chunk in pd.read_csv(self.file_name, chunksize=chunk_size):
        ppg.log_frivolity("Iteration", i)
        i += 1
        chunk.dropna(inplace=True)
        # Check the data format once
        if first_iteration:
            if not self._data_is_correct(chunk):
                ppg.mock_error("Incorrect data format. Aborting splitting.")
                return
            ppg.log_info("Data format correct.")
            first_iteration = False
        # Check whether an earthquake ended between the previous and current chunk
        if buffer is not None and self._is_split_on_eq(buffer, chunk):
            self._save_eq(buffer)
            buffer = None
        # Complete the buffer
        before_eq, after_eq = self._split_on_eq(chunk)
        if buffer is None:
            buffer = before_eq.copy()
        else:
            buffer = pd.concat([buffer, before_eq])
        # Save the buffer if there has been an earthquake
        if after_eq is not None:
            self._save_eq(buffer)
            buffer = after_eq.copy()
    self._metadata.to_csv(fmd.META_DATA_FILE, index=False)
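# Hedged usage sketch for the full pipeline; the class name `FileSplitter` and
# its no-argument constructor are assumptions, not confirmed by this excerpt.
#
#   splitter = FileSplitter()
#   splitter.read_file_name("data/train.csv")  # falls back to fmd.default_read_file if invalid
#   splitter.split_file()                      # one CSV per earthquake + metadata file
#   splitter.update_global_metadata()          # adds the 'global' summary row in memory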
def update_global_metadata(self):
    """
    Once the metadata for each earthquake is filled in, derives the metadata
    for the whole original file.
    """
    ppg.log_info("Computing global metadata.")
    self._metadata.loc['global', 'size'] = self._metadata.loc[:, 'size'].sum()
    # The freshly created 'global' row holds NaN in the remaining columns, and
    # pandas aggregations skip NaN by default, so it does not bias the results below.
    self._metadata.loc['global', 'max'] = self._metadata.loc[:, 'max'].max()
    self._metadata.loc['global', 'min'] = self._metadata.loc[:, 'min'].min()
    # Trick to save RAM while computing the mean: weight the per-earthquake means
    # by their sizes instead of re-reading the raw data.
    weighted_sum_means = self._metadata.loc[:, 'size'] * self._metadata.loc[:, 'mean']
    self._metadata.loc['global', 'mean'] = weighted_sum_means.sum() / self._metadata.loc['global', 'size']
    # Trick to save RAM while computing the global variance: Var(x) = E[x^2] - E[x]^2.
    # .loc['global', ...] yields scalars, so use ** rather than Series.pow.
    mean_of_squares = self._metadata.loc[:, 'sum_of_sq'].sum() / self._metadata.loc['global', 'size']
    variance = mean_of_squares - self._metadata.loc['global', 'mean'] ** 2
    self._metadata.loc['global', 'stdev'] = variance ** 0.5
    # Get rid of the column holding the intermediary result
    self._metadata.drop(columns='sum_of_sq', inplace=True)
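# A minimal standalone check of the pooled-statistics identities used above,
# assuming only numpy; the _demo_* name is illustrative and independent of this class.
def _demo_pooled_stats():
    """Verifies mean = sum(size*mean)/N and var = sum(x^2)/N - mean^2 on toy data."""
    import numpy as np
    rng = np.random.default_rng(0)
    chunks = [rng.normal(size=n) for n in (100, 250, 50)]  # three fake "earthquakes"
    full = np.concatenate(chunks)
    n_total = sum(len(c) for c in chunks)
    pooled_mean = sum(len(c) * c.mean() for c in chunks) / n_total
    pooled_var = sum((c ** 2).sum() for c in chunks) / n_total - pooled_mean ** 2
    assert np.isclose(pooled_mean, full.mean())
    assert np.isclose(pooled_var ** 0.5, full.std())  # population std (ddof=0)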