예제 #1
0
    def generate(self, dataset, dataset_name):
        """
        Featurize `dataset` and serialize the result under `dataset_name`.

        The output is a `pd.DataFrame` with a `"Features"` column, computed by
        :func:`_features_from_text` and handed to :func:`_write`.  If the location
        returned by :func:`feature_set_location` for this dataset / featurizer class
        already exists, a message is printed and nothing is computed or written.

        If a given featurizer exposes a :func:`featurize_batch` method, that method will be
        called to perform featurization.  Otherwise, :class:`Featurizer`'s will fall
        back to calling :func:`featurize` on each individual example.

        :param dataset: either a `pd.DataFrame` (or subclass) that contains a `Text` column,
            or a list (or subclass) of items whose element `0` is the text and element `1`
            the target.
        :param dataset_name: `str` name to use as a save location in the `config.FEATURES_DIRECTORY`.
        :raises ValueError: if `dataset` is neither a list nor a `pd.DataFrame`.
        """

        if os.path.exists(feature_set_location(dataset_name, self.__class__.__name__)):
            print("Skipping, already have this feature combination.")
            return

        # isinstance (rather than an exact type() comparison) so that subclasses of
        # list / pd.DataFrame are handled instead of being rejected as unrecognised.
        if isinstance(dataset, list):
            text = [d[0] for d in dataset]
            features = self._features_from_text(text)
            new_dataset = pd.DataFrame(data={
                "Text": text,
                "Targets": [d[1] for d in dataset],
                "Features": features
            })

        elif isinstance(dataset, pd.DataFrame):
            features = self._features_from_text(dataset["Text"])
            new_dataset = dataset.copy()  # Don't want to modify the underlying dataframe
            new_dataset['Features'] = features
        else:
            raise ValueError("Unrecognised data format!!")

        self._write(new_dataset, dataset_name)
예제 #2
0
    def generate(self, dataset, dataset_name):
        """
        Build a feature table from a `dataset` dict and serialize it under `dataset_name`.

        No featurizer is invoked here: each entry of the `"Features"` column is simply the
        `(text, context)` pair taken from the corresponding positions of `dataset['text']`
        and `dataset['context']`.  The resulting `pd.DataFrame` (columns `Text`, `Features`,
        `Targets`) is handed to :func:`_write`.

        If the location returned by :func:`feature_set_location` for this dataset /
        featurizer class already exists, a message is printed and nothing is written.

        :param dataset: `dict` (or subclass) with the keys `'text'`, `'context'` and `'labels'`.
        :param dataset_name: `str` name to use as a save location in the `config.FEATURES_DIRECTORY`.
        :raises ValueError: if `dataset` is not a dict.
        :raises KeyError: if one of the required keys is missing from `dataset`.
        """

        if os.path.exists(
                feature_set_location(dataset_name, self.__class__.__name__)):
            print("Skipping, already have this feature combination.")
            return

        # isinstance (rather than an exact type() comparison) so that dict subclasses
        # are accepted as well.
        if not isinstance(dataset, dict):
            raise ValueError("dataset must be a dict")
        text = dataset['text']
        context = dataset['context']
        labels = dataset['labels']
        # Pair every text with its context; zip stops at the shorter of the two.
        feats = list(zip(text, context))
        new_dataset = pd.DataFrame.from_dict({
            'Text': text,
            'Features': feats,
            'Targets': labels
        })
        self._write(new_dataset, dataset_name)
예제 #3
0
 def _write(self, featurized_dataset, dataset_name):
     """
     Persist `featurized_dataset` to the filesystem with `joblib.dump`.

     The destination is whatever `feature_set_location` returns for `dataset_name`
     and the name of this instance's class.
     """
     joblib.dump(
         featurized_dataset,
         feature_set_location(dataset_name, self.__class__.__name__),
     )
예제 #4
0
 def _load_dataset(dataset_name, featurizer_name):
     """
     Load a stored dataset for the given dataset / featurizer pair.

     :param dataset_name: dataset name, passed to `feature_set_location`.
     :param featurizer_name: featurizer name, passed to `feature_set_location`.
     :return: whatever `joblib.load` returns for the resolved location.
     """
     read_location = feature_set_location(dataset_name, featurizer_name)
     # Hand the location to logging as an argument so the message is only
     # formatted when the record is actually emitted.
     logging.info("Loading Dataset: %s", read_location)
     # NOTE(review): joblib.load is pickle-based and can execute arbitrary code
     # while loading -- only read feature sets from trusted locations.
     return joblib.load(read_location)