def pd_read(self, relative_path, **kwargs):
    """Load the data file located at `relative_path` into a `Pandas <https://pandas.pydata.org>`_ DataFrame.

    This is a thin wrapper around pandas.read_csv(): keyword arguments are passed along to it, except
    `delimiter`, `skiprows`, `header` and `names`, which are always overwritten with values taken from
    the descriptor of the data file.

    :param relative_path: path to the data file (relative to the archive root).
    :type relative_path: str

    :raises: `ImportError` if Pandas is not installed.
    :raises: :class:`dwca.exceptions.NotADataFile` if `relative_path` doesn't designate a valid data file
        in the archive.

    .. warning:: You'll need to `install Pandas <http://pandas.pydata.org/pandas-docs/stable/install.html>`_
        before using this method.

    .. note:: Default values of Darwin Core Archive are supported: every field of the descriptor that
        carries a default value gets a DataFrame column filled with that value.
    """
    descriptor = self.get_descriptor_for(relative_path)
    if descriptor is None:
        raise NotADataFile()

    pandas_available = dwca.vendor._has_pandas
    if not pandas_available:
        raise ImportError("Pandas is missing.")

    # Imported lazily so that the module can be used without Pandas installed.
    import pandas

    # Parsing options dictated by the data file always win over caller-supplied ones.
    kwargs.update(
        delimiter=descriptor.fields_terminated_by,
        skiprows=descriptor.lines_to_ignore,
        header=None,
        names=descriptor.short_headers,
    )
    frame = pandas.read_csv(self.absolute_temporary_path(relative_path), **kwargs)

    # Materialize the default values declared in the descriptor as constant columns.
    for declared_field in descriptor.fields:
        default = declared_field['default']
        if default is None:
            continue
        frame[shorten_term(declared_field['term'])] = default

    return frame
def __dataframe_read(self, relative_path, df_module='pandas', **kwargs):
    """Return a DataFrame for the data file located at `relative_path`.

    The file is loaded with ``pandas.read_csv()`` or ``dask.dataframe.read_csv()`` (depending on
    `df_module`); extra keyword arguments are forwarded to that function.

    The following arguments will be ignored (because they are set appropriately for the data file):
    `delimiter`, `skiprows`, `header` and `names`.

    Only the columns declared in the file descriptor (plus the `coreid` column for an extension, or
    the `id` column for a core file when no declared field already uses that name or column) are kept.
    They are ordered by column index and labelled with the shortened term names.

    :param relative_path: path to the data file (relative to the archive root).
    :type relative_path: str
    :param df_module: library used to load the file: ``'pandas'`` (default) or ``'dask'``.
    :type df_module: str

    :returns: the DataFrame built by the selected library.

    :raises: `ImportError` if Pandas is not installed (or if `df_module` is ``'dask'`` and dask cannot
        be imported).
    :raises: `ValueError` if `df_module` is neither ``'pandas'`` nor ``'dask'``.
    :raises: :class:`dwca.exceptions.NotADataFile` if `relative_path` doesn't designate a valid data file
        in the archive.

    .. note:: Default values of Darwin Core Archive are supported: every field of the descriptor that
        carries a default value gets a DataFrame column filled with that value.
    """
    datafile_descriptor = self.get_descriptor_for(relative_path)  # type: DataFileDescriptor

    # Same guard as pd_read(): fail with the documented exception rather than an AttributeError.
    if datafile_descriptor is None:
        raise NotADataFile()

    if not dwca.vendor._has_pandas:
        raise ImportError("Pandas is missing.")

    if df_module == 'pandas':
        from pandas import read_csv
    elif df_module == 'dask':
        from dask.dataframe import read_csv
    else:
        # Without this branch, read_csv would be unbound and fail later with an obscure error.
        raise ValueError(
            "Unsupported df_module {!r}: expected 'pandas' or 'dask'.".format(df_module)
        )

    kwargs['delimiter'] = datafile_descriptor.fields_terminated_by
    kwargs['skiprows'] = datafile_descriptor.lines_to_ignore
    kwargs['header'] = None
    # Columns are read with positional (integer) labels and renamed afterwards, so a caller-supplied
    # `names` must not reach read_csv(): it would replace the integer labels used for selection below.
    kwargs.pop('names', None)

    # Map column index -> short term name.
    # Fields without a column index are skipped (presumably default-only fields with no CSV column
    # -- TODO confirm); they could be neither sorted with integers nor selected from the frame.
    names = {
        f['index']: shorten_term(f['term'])
        for f in datafile_descriptor.fields
        if f['index'] is not None
    }

    # The names are applied after loading instead of via index_col/names, because (per the original
    # author's note) index_col cannot be specified when using a dask dataframe.
    df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)

    if datafile_descriptor.represents_extension:
        names[datafile_descriptor.coreid_index] = 'coreid'

    if datafile_descriptor.represents_corefile:
        if 'id' not in names.values():
            if datafile_descriptor.id_index in names:
                # NOTE(review): this copy is labelled -1, which is not added to `names`, so it appears
                # to be discarded by the column selection below -- confirm the intended behavior.
                df[-1] = df[datafile_descriptor.id_index]  # -1 for index
            else:
                names[datafile_descriptor.id_index] = 'id'

    # Keep only the known columns, ordered by column index, and give them their final labels.
    # Plain lists are used (rather than dict views) so both pandas and dask accept them.
    ordered = sorted(names.items())
    df = df.loc[:, [index for index, _ in ordered]]
    df.columns = [name for _, name in ordered]

    # Add a column for default values, if present in the file descriptor
    for field in datafile_descriptor.fields:
        field_default_value = field['default']
        if field_default_value is not None:
            df[shorten_term(field['term'])] = field_default_value

    return df