def pd_read(self, relative_path, **kwargs):
        """Return a `Pandas <https://pandas.pydata.org>`_ \
        `DataFrame <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_ for the data file \
        located at `relative_path`.

        This method wraps pandas.read_csv() and accepts the same keyword arguments. The following arguments will be
        ignored (because they are set appropriately for the data file): `delimiter` (and its alias `sep`),
        `skiprows`, `header` and `names`.

        :param relative_path: path to the data file (relative to the archive root).
        :type relative_path: str
        :param kwargs: additional keyword arguments, forwarded to pandas.read_csv().

        :returns: the content of the data file, one column per header of the file descriptor (plus one column
            per default-only term, see the note below).

        :raises: `ImportError` if Pandas is not installed.
        :raises: :class:`dwca.exceptions.NotADataFile` if `relative_path` doesn't designate a valid data file\
        in the archive.

        .. warning::

            You'll need to `install Pandas <http://pandas.pydata.org/pandas-docs/stable/install.html>`_ before using
            this method.

        .. note::

            Default values of Darwin Core Archive are supported: A column will be added to the DataFrame if a term has
            a default value in the Metafile (but no corresponding column in the CSV Data File). If the column does
            exist in the data file, the values read from the file are kept and only the values Pandas parsed as
            missing are replaced by the default.
        """
        datafile_descriptor = self.get_descriptor_for(relative_path)

        if datafile_descriptor is None:
            raise NotADataFile()

        if not dwca.vendor._has_pandas:
            raise ImportError("Pandas is missing.")

        # Imported lazily: Pandas is an optional dependency.
        from pandas import read_csv

        # `sep` is an alias of `delimiter`; pandas refuses to receive both, so a caller-supplied `sep` has to be
        # dropped for the documented "ignored" behaviour to hold.
        kwargs.pop('sep', None)
        kwargs['delimiter'] = datafile_descriptor.fields_terminated_by
        kwargs['skiprows'] = datafile_descriptor.lines_to_ignore
        kwargs['header'] = None
        kwargs['names'] = datafile_descriptor.short_headers

        df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)

        # Apply the default values declared in the file descriptor.
        for field in datafile_descriptor.fields:
            default_value = field['default']
            if default_value is None:
                continue

            column = shorten_term(field['term'])
            if column in df.columns:
                # The term has real data in the file: only fill the gaps, never clobber what was read.
                df[column] = df[column].fillna(default_value)
            else:
                # Default-only term: constant column for every row.
                df[column] = default_value

        return df
Пример #2
0
    def __dataframe_read(self, relative_path, df_module='pandas', **kwargs):
        """Return a DataFrame for the data file located at `relative_path`.

        Depending on `df_module`, the result is a `Pandas <https://pandas.pydata.org>`_ DataFrame (built with
        pandas.read_csv()) or a Dask DataFrame (built with dask.dataframe.read_csv()).

        This method accepts the keyword arguments of the selected read_csv() function. The following arguments will
        be ignored (because they are set appropriately for the data file): `delimiter` (and its alias `sep`),
        `skiprows`, `header` and `names`.

        :param relative_path: path to the data file (relative to the archive root).
        :type relative_path: str
        :param df_module: DataFrame implementation to use, either ``'pandas'`` (default) or ``'dask'``.
        :type df_module: str
        :param kwargs: additional keyword arguments, forwarded to read_csv().

        :returns: the content of the data file, with columns ordered by their position in the file and named after
            the shortened terms of the file descriptor (plus `id` / `coreid` where applicable).

        :raises: `ImportError` if Pandas (or the requested DataFrame module) is not installed.
        :raises: `ValueError` if `df_module` is neither ``'pandas'`` nor ``'dask'``.
        :raises: :class:`dwca.exceptions.NotADataFile` if `relative_path` doesn't designate a valid data file\
        in the archive.

        .. warning::

            You'll need to `install Pandas <http://pandas.pydata.org/pandas-docs/stable/install.html>`_ before using
            this method.

        .. note::

            Default values of Darwin Core Archive are supported: A column will be added to the DataFrame if a term has
            a default value in the Metafile (but no corresponding column in the CSV Data File). If the column does
            exist in the data file, only the values parsed as missing are replaced by the default.
        """
        datafile_descriptor = self.get_descriptor_for(relative_path)  # type: DataFileDescriptor

        if datafile_descriptor is None:
            raise NotADataFile()

        if not dwca.vendor._has_pandas:
            raise ImportError("Pandas is missing.")

        if df_module == 'pandas':
            from pandas import read_csv
        elif df_module == 'dask':
            from dask.dataframe import read_csv
        else:
            # Without this branch `read_csv` would be unbound and fail later with an obscure error.
            raise ValueError("Unsupported df_module: {!r} (expected 'pandas' or 'dask').".format(df_module))

        # The column mapping below relies on the integer labels produced by header=None without `names`, so a
        # caller-supplied `names` is discarded. `sep` is an alias of `delimiter` and cannot be passed with it.
        kwargs.pop('names', None)
        kwargs.pop('sep', None)
        kwargs['delimiter'] = datafile_descriptor.fields_terminated_by
        kwargs['skiprows'] = datafile_descriptor.lines_to_ignore
        kwargs['header'] = None

        # No index_col is given on purpose: Dask's read_csv does not accept that parameter.
        df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)

        # Column position -> column name. Fields without a position (presumably the default-only terms) are
        # skipped here, a None key could not be sorted with the integer positions; they are handled further down.
        names = {f['index']: shorten_term(f['term'])
                 for f in datafile_descriptor.fields
                 if f['index'] is not None}

        if datafile_descriptor.represents_extension:
            names[datafile_descriptor.coreid_index] = 'coreid'

        if datafile_descriptor.represents_corefile and 'id' not in names.values():
            id_index = datafile_descriptor.id_index
            if id_index in names:
                # The id column is also mapped to a term: keep the term column and expose a copy of it as 'id'.
                # The copy is registered at position -1 so it sorts first and survives the column selection.
                df[-1] = df[id_index]
                names[-1] = 'id'
            elif id_index is not None:
                names[id_index] = 'id'

        # Keep only the described columns, in file order, and give them their final names.
        ordered = sorted(names.items())
        df = df.loc[:, [position for position, _ in ordered]]
        df.columns = [name for _, name in ordered]

        # Apply the default values declared in the file descriptor.
        for field in datafile_descriptor.fields:
            default_value = field['default']
            if default_value is None:
                continue

            column = shorten_term(field['term'])
            if column in df.columns:
                # The term has real data in the file: only fill the gaps, never clobber what was read.
                df[column] = df[column].fillna(default_value)
            else:
                # Default-only term: constant column for every row.
                df[column] = default_value

        return df