    def read_file_name(self, new_file_name):
        """
        Checks whether the provided file name is valid and, if so, changes the reference file to read from.
            :param new_file_name: New file to be split later.
        """
        if self._file_exists(new_file_name) and self._extension_is_correct(new_file_name):
            self._file_name = new_file_name
        else:
            ppg.log_info("Registering default file_name", fmd.default_read_file)
            self._file_name = fmd.default_read_file
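
    # Illustrative call (the variable name `splitter` is assumed, not from the
    # original source): `splitter.read_file_name("train.csv")` keeps the new
    # name only if the file exists and has the expected extension; otherwise
    # the default from fmd is registered instead.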

    @staticmethod
    def read_metadata_file():
        """Reads the metadata file if it exists."""
        metadata = None
        if not os.path.isfile(fmd.META_DATA_FILE):
            ppg.log_info(
                "No metadata found. The earthquake splitting may not have been run yet."
            )
        else:
            ppg.log_info("Found metadata file")
            metadata = pd.read_csv(fmd.META_DATA_FILE)
        return metadata
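
    # Typical read-back (a sketch; `FileSplitter` is an assumed name for the
    # enclosing class, not confirmed by the source):
    #
    #     metadata = FileSplitter.read_metadata_file()
    #     if metadata is not None:
    #         print(metadata.describe())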

    def update_eq_metadata(self, data):
        """
        Computes the metadata for a single earthquake and saves it in self._metadata.
            :param data: Data frame containing one full earthquake.
        """
        ppg.log_info("Computing earthquake metadata")
        signal = data.iloc[:, fmd.Column.DATA.value]
        self._metadata.loc[self._nb_earthquake, 'size'] = len(data)
        self._metadata.loc[self._nb_earthquake, 'max'] = signal.max()
        self._metadata.loc[self._nb_earthquake, 'min'] = signal.min()
        self._metadata.loc[self._nb_earthquake, 'mean'] = signal.mean()
        self._metadata.loc[self._nb_earthquake, 'stdev'] = signal.std()
        # The sum of squares is kept so the global variance can later be
        # derived without re-reading the raw data.
        self._metadata.loc[self._nb_earthquake, 'sum_of_sq'] = signal.pow(2).sum()
        self._nb_earthquake += 1
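
    # For illustration, self._metadata then holds one row per earthquake,
    # e.g. (the values below are made up):
    #
    #        size   max   min  mean  stdev  sum_of_sq
    #     0  1500  12.4 -11.8  0.02   2.31    8007.5
    #     1  1423   9.7  -9.9  0.01   2.12    6398.2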

    def split_file(self):
        """
        Goes through the file designated by self._file_name, identifies earthquakes and saves each one in a separate
        file named "self.output_file_name+earthquake_number+.csv".
        Extracts metadata from the original file: size, mean, standard deviation, ...
        """

        chunk_size = 10**6  # Number of lines read from the raw file at once; reduce to save RAM
        buffer = None       # Accumulates the rows of the current earthquake until it is complete

        # Iteration variables
        self._nb_earthquake = 0
        first_iteration = True
        i = 0

        for chunk in pd.read_csv(self._file_name, chunksize=chunk_size):
            ppg.log_frivolity("Iteration", i)
            i += 1
            chunk.dropna(inplace=True)
            # Checks data format once
            if first_iteration:
                if not self._data_is_correct(chunk):
                    ppg.mock_error("Incorrect data format. Abort splitting.")
                    return
                else:
                    ppg.log_info("Data format correct.")
                    first_iteration = False

            # Check whether an earthquake ended at the boundary between the previous and the current chunk
            if buffer is not None and self._is_split_on_eq(buffer, chunk):
                self._save_eq(buffer)
                buffer = None

            # Complete the buffer
            before_eq, after_eq = self._split_on_eq(chunk)
            if buffer is None:
                buffer = before_eq.copy()
            else:
                buffer = pd.concat([buffer, before_eq])

            # Save the buffer if an earthquake ended within this chunk
            if after_eq is not None:
                self._save_eq(buffer)
                buffer = after_eq.copy()

        self._metadata.to_csv(fmd.META_DATA_FILE, index=False)
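
    # Minimal end-to-end sketch (the class name `FileSplitter` and the file
    # name are illustrative assumptions, not from the original source):
    #
    #     splitter = FileSplitter()
    #     splitter.read_file_name("train.csv")  # falls back to the default on bad input
    #     splitter.split_file()                 # one CSV per earthquake + per-quake metadata
    #     splitter.update_global_metadata()     # adds the 'global' summary row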
    def update_global_metadata(self):
        """
        Once the metadata for each earthquake has been filled in, aggregates it into metadata for the whole original file.
        """
        ppg.log_info("Computing global metadata.")

        self._metadata.loc['global', 'size'] = self._metadata.loc[:, 'size'].sum()
        self._metadata.loc['global', 'max'] = self._metadata.loc[:, 'max'].max()
        self._metadata.loc['global', 'min'] = self._metadata.loc[:, 'min'].min()

        # Trick to save RAM while computing the mean: the global mean is the size-weighted average
        # of the per-earthquake means (the NaN entries of the 'global' row are skipped by sum()).
        weighted_sum_means = self._metadata.loc[:, 'size'] * self._metadata.loc[:, 'mean']
        self._metadata.loc['global', 'mean'] = weighted_sum_means.sum() / self._metadata.loc['global', 'size']

        # Trick to save RAM while computing the global variance: Var(X) = E[X^2] - E[X]^2, using the
        # per-earthquake sums of squares (this yields the population standard deviation).
        variance = self._metadata.loc[:, 'sum_of_sq'].sum() / self._metadata.loc['global', 'size'] \
                   - self._metadata.loc['global', 'mean'] ** 2
        self._metadata.loc['global', 'stdev'] = variance ** 0.5

        # Drop the column holding the intermediary result
        self._metadata.drop(columns='sum_of_sq', inplace=True)
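

if __name__ == "__main__":
    # Self-contained sanity check of the streaming mean/variance identity used
    # in update_global_metadata (a sketch; numpy is assumed available, as it
    # ships with pandas).
    import numpy as np

    x = np.random.randn(10_000)
    parts = np.array_split(x, 7)                        # simulate per-earthquake splits
    size = sum(len(p) for p in parts)
    mean = sum(len(p) * p.mean() for p in parts) / size
    var = sum((p ** 2).sum() for p in parts) / size - mean ** 2
    assert np.isclose(mean, x.mean())
    assert np.isclose(var ** 0.5, x.std())              # population std (ddof=0)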