Exemplo n.º 1
0
    def featurize_many(self,
                       entries,
                       ignore_errors=False,
                       return_errors=False,
                       pbar=True):
        """Featurize a list of entries.

        If `featurize` takes multiple inputs, supply inputs as a list of tuples.

        Featurize_many supports entries as a list, tuple, numpy array,
        Pandas Series, or Pandas DataFrame.

        Args:
            entries (list-like object): A list of entries to be featurized.
            ignore_errors (bool): Returns NaN for entries where exceptions are
                thrown if True. If False, exceptions are thrown as normal.
            return_errors (bool): If True, returns the feature list as
                determined by ignore_errors with traceback strings added
                as an extra 'feature'. Entries which featurize without
                exceptions have this extra feature set to NaN.
            pbar (bool): Show a progress bar for featurization if True.

        Returns:
            (list) features for each entry.

        Raises:
            ValueError: If return_errors is True while ignore_errors is False.
            Exception: If entries is not a list, tuple, numpy array,
                Pandas Series, or Pandas DataFrame.
        """

        if return_errors and not ignore_errors:
            raise ValueError("Please set ignore_errors to True to use"
                             " return_errors.")

        # Check inputs
        if not isinstance(entries,
                          (tuple, list, np.ndarray, pd.Series, pd.DataFrame)):
            raise Exception("'entries' must be a list-like object")

        # Special case: Empty list. Compare by value; identity comparison
        # against an int literal only works through interpreter caching.
        if len(entries) == 0:
            return []

        # Each entry is handed to featurize_wrapper as one sequence of
        # arguments. DataFrame rows already are sequences; a Series or a
        # sequence of bare (non tuple/list/array) items gets each item wrapped
        # in a 1-tuple via zip.
        if isinstance(entries, pd.DataFrame):
            entries = entries.values
        elif isinstance(
                entries,
                pd.Series) or not isinstance(entries[0],
                                             (tuple, list, np.ndarray)):
            entries = zip(entries)

        # Add a progress bar
        if pbar:
            if is_notebook():
                tqdm_func = tqdm.tqdm_notebook
            else:
                tqdm_func = tqdm.tqdm
            # list() required, tqdm has issues with memory if generator given
            entries = tqdm_func(list(entries), desc=self.__class__.__name__)

        run_serial = self.n_jobs == 1
        if not run_serial and sys.version_info[0] < 3:
            warnings.warn("Multiprocessing is not supported in "
                          "matminer for Python 2.x. Multiprocessing has "
                          "been disabled. Please upgrade to Python 3.x to "
                          "enable multiprocessing.")
            self.set_n_jobs(1)
            # Fall through to the serial loop below rather than calling
            # featurize_many again: `entries` has already been zipped and
            # possibly wrapped in a progress bar, so it would no longer pass
            # the list-like input check above.
            run_serial = True

        # Run the actual featurization
        if run_serial:
            return [
                self.featurize_wrapper(x,
                                       ignore_errors=ignore_errors,
                                       return_errors=return_errors)
                for x in entries
            ]

        with Pool(self.n_jobs) as p:
            func = partial(self.featurize_wrapper,
                           return_errors=return_errors,
                           ignore_errors=ignore_errors)
            return p.map(func, entries, chunksize=self.chunksize)
Exemplo n.º 2
0
    def featurize_many(self, entries, ignore_errors=False, return_errors=False,
                       pbar=True):
        """Featurize a list of entries.

        If `featurize` takes multiple inputs, supply inputs as a list of tuples.

        Featurize_many supports entries as a list, tuple, numpy array,
        Pandas Series, or Pandas DataFrame.

        Args:
            entries (list-like object): A list of entries to be featurized.
            ignore_errors (bool): Returns NaN for entries where exceptions are
                thrown if True. If False, exceptions are thrown as normal.
            return_errors (bool): If True, returns the feature list as
                determined by ignore_errors with traceback strings added
                as an extra 'feature'. Entries which featurize without
                exceptions have this extra feature set to NaN.
            pbar (bool): Show a progress bar for featurization if True.

        Returns:
            (list) features for each entry.

        Raises:
            ValueError: If return_errors is True while ignore_errors is False.
            Exception: If entries is not a list, tuple, numpy array,
                Pandas Series, or Pandas DataFrame.
        """

        if return_errors and not ignore_errors:
            raise ValueError("Please set ignore_errors to True to use"
                             " return_errors.")

        # Check inputs
        if not isinstance(entries, (tuple, list, np.ndarray, pd.Series, pd.DataFrame)):
            raise Exception("'entries' must be a list-like object")

        # Special case: Empty list (value comparison; `is 0` would test object
        # identity and triggers a SyntaxWarning on modern interpreters).
        if len(entries) == 0:
            return []

        # featurize_wrapper receives one sequence of arguments per entry:
        # DataFrame rows are used as-is, while a Series or a sequence of bare
        # items has every item wrapped in a 1-tuple by zip.
        if isinstance(entries, pd.DataFrame):
            entries = entries.values
        elif isinstance(entries, pd.Series) or not isinstance(entries[0], (tuple, list, np.ndarray)):
            entries = zip(entries)

        # Add a progress bar
        if pbar:
            if is_notebook():
                tqdm_func = tqdm.tqdm_notebook
            else:
                tqdm_func = tqdm.tqdm
            # list() required, tqdm has issues with memory if generator given
            entries = tqdm_func(list(entries), desc=self.__class__.__name__)

        use_pool = self.n_jobs != 1
        if use_pool and sys.version_info[0] < 3:
            warnings.warn("Multiprocessing is not supported in "
                          "matminer for Python 2.x. Multiprocessing has "
                          "been disabled. Please upgrade to Python 3.x to "
                          "enable multiprocessing.")
            self.set_n_jobs(1)
            # Do not recurse into featurize_many here: `entries` is already
            # preprocessed (zipped / progress-bar wrapped) and would be
            # rejected by the input check. Use the serial path directly.
            use_pool = False

        # Run the actual featurization
        if not use_pool:
            return [self.featurize_wrapper(x, ignore_errors=ignore_errors,
                                           return_errors=return_errors) for x in entries]

        with Pool(self.n_jobs) as p:
            func = partial(self.featurize_wrapper,
                           return_errors=return_errors,
                           ignore_errors=ignore_errors)
            return p.map(func, entries, chunksize=self.chunksize)