Exemplo n.º 1
0
 def __init__(self, algorithm=None, train_params=None, verbose=False,
              model_filename=None, keep_tempfiles=False, trainer_cls=None):
     """
     Store the estimator settings and create the model file resource.

     ``algorithm``, ``train_params`` and ``verbose`` are kept unchanged as
     attributes of the same name. ``model_filename`` and
     ``keep_tempfiles`` are forwarded to ``FileResource`` together with a
     fixed ``".crfsuite"`` suffix and ``"model"`` prefix. When
     ``trainer_cls`` is None, ``pycrfsuite.Trainer`` is stored instead.
     """
     self.algorithm = algorithm
     self.train_params = train_params

     resource_options = {
         "filename": model_filename,
         "keep_tempfiles": keep_tempfiles,
         "suffix": ".crfsuite",
         "prefix": "model",
     }
     self.modelfile = FileResource(**resource_options)

     self.verbose = verbose
     # No tagger object has been created yet.
     self._tagger = None
     self.trainer_cls = (
         pycrfsuite.Trainer if trainer_cls is None else trainer_cls
     )
     self.training_log_ = None
Exemplo n.º 2
0
 def __init__(self,
              algorithm=None,
              train_params=None,
              verbose=False,
              model_filename=None,
              keep_tempfiles=False,
              trainer_cls=None):
     """
     Record the constructor arguments and build the model file resource.

     The model file is a ``FileResource`` created from ``model_filename``
     and ``keep_tempfiles`` with suffix ``".crfsuite"`` and prefix
     ``"model"``. ``trainer_cls`` falls back to ``pycrfsuite.Trainer``
     when it is None.
     """
     self.algorithm = algorithm
     self.train_params = train_params
     self.modelfile = FileResource(filename=model_filename,
                                   keep_tempfiles=keep_tempfiles,
                                   suffix=".crfsuite",
                                   prefix="model")
     self.verbose = verbose
     self._tagger = None

     # Substitute the default trainer class before storing it.
     if trainer_cls is None:
         trainer_cls = pycrfsuite.Trainer
     self.trainer_cls = trainer_cls

     self.training_log_ = None
Exemplo n.º 3
0
class CRF(object):
    """
    Train a CRF model through a python-crfsuite trainer and tag sequences.

    Parameters
    ----------
    algorithm : passed as ``algorithm`` to ``trainer_cls`` on every ``fit``.

    train_params : passed as ``params`` to ``trainer_cls`` on every ``fit``.

    verbose : bool
        When true, data loading is wrapped in ``tqdm`` progress bars; the
        flag is also passed to ``trainer_cls``.

    model_filename : forwarded to ``FileResource`` as ``filename``.

    keep_tempfiles : forwarded to ``FileResource``.

    trainer_cls : (optional) callable
        Factory for trainer objects; ``pycrfsuite.Trainer`` when None.

    Attributes
    ----------
    training_log_ :
        ``logparser`` attribute of the trainer used by the last ``fit``
        call; None before the first call.
    """

    def __init__(self, algorithm=None, train_params=None, verbose=False,
                 model_filename=None, keep_tempfiles=False, trainer_cls=None):
        self.algorithm = algorithm
        self.train_params = train_params
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=keep_tempfiles,
            suffix=".crfsuite",
            prefix="model"
        )
        self.verbose = verbose
        self._tagger = None
        if trainer_cls is None:
            self.trainer_cls = pycrfsuite.Trainer
        else:
            self.trainer_cls = trainer_cls
        self.training_log_ = None

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).

        y : list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.

        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given, or if the features
            and labels of a data set are sized containers of different
            lengths.
        """
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")

        # zip() stops at the shorter input, so a length mismatch would
        # silently drop sequences; reject it before touching any state.
        self._check_same_length(X, y, "X", "y")
        if X_dev is not None:
            self._check_same_length(X_dev, y_dev, "X_dev", "y_dev")

        # Close any open tagger before the model file is refreshed.
        if self._tagger is not None:
            self._tagger.close()
            self._tagger = None
        self.modelfile.refresh()

        trainer = self._get_trainer()
        self._append_sequences(
            trainer, X, y, "loading training data to CRFsuite")
        if X_dev is not None:
            # Dev sequences are appended with a third argument of 1, the
            # same value that is passed to train() as ``holdout`` below.
            self._append_sequences(
                trainer, X_dev, y_dev, "loading dev data to CRFsuite", 1)

        trainer.train(self.modelfile.name, holdout=-1 if X_dev is None else 1)
        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Predict labels for several sequences.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists of strings
            predicted labels

        """
        return [self.predict_single(xseq) for xseq in X]

    def predict_single(self, xseq):
        """
        Predict labels for a single sequence.

        Parameters
        ----------
        xseq : list of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of strings
            predicted labels

        """
        return self.tagger.tag(xseq)

    def predict_marginals(self, X):
        """
        Predict marginal probabilities for several sequences.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists of dicts
            predicted probabilities for each label at each position

        """
        return [self.predict_marginals_single(xseq) for xseq in X]

    def predict_marginals_single(self, xseq):
        """
        Predict marginal probabilities for a single sequence.

        Parameters
        ----------
        xseq : list of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of dicts
            predicted probabilities for each label at each position

        """
        # Read the property once instead of once per label and position.
        tagger = self.tagger
        labels = tagger.labels()
        tagger.set(xseq)
        return [
            {label: tagger.marginal(label, i) for label in labels}
            for i in range(len(xseq))
        ]

    @property
    def tagger(self):
        """
        Tagger opened on the model file; created on first access and reused.

        Raises
        ------
        Exception
            If the model file has no name.
        """
        if self._tagger is None:
            if self.modelfile.name is None:
                raise Exception("Can't load model. Is the model trained?")

            tagger = pycrfsuite.Tagger()
            tagger.open(self.modelfile.name)
            self._tagger = tagger
        return self._tagger

    def _get_trainer(self):
        """Create a trainer from ``trainer_cls`` using the stored settings."""
        return self.trainer_cls(
            algorithm=self.algorithm,
            params=self.train_params,
            verbose=self.verbose,
        )

    def _append_sequences(self, trainer, X, y, desc, *group):
        """Append each (xseq, yseq) pair to ``trainer``, plus ``group`` if given."""
        data = zip(X, y)
        if self.verbose:
            # The total is None for unsized iterables, so verbose mode
            # accepts the same inputs as non-verbose mode.
            data = tqdm(data, desc, self._len_or_none(X), leave=True)

        for xseq, yseq in data:
            trainer.append(xseq, yseq, *group)

        if self.verbose:
            print("")

    def _check_same_length(self, X, y, x_name, y_name):
        """Raise ValueError if ``X`` and ``y`` are both sized and differ in length."""
        n_x = self._len_or_none(X)
        n_y = self._len_or_none(y)
        if n_x is not None and n_y is not None and n_x != n_y:
            raise ValueError(
                "%s and %s must have the same length (got %d and %d)"
                % (x_name, y_name, n_x, n_y)
            )

    @staticmethod
    def _len_or_none(seq):
        """Return ``len(seq)``, or None when ``seq`` does not support ``len``."""
        try:
            return len(seq)
        except TypeError:
            return None

    def __getstate__(self):
        """Return a copy of the instance dict with ``_tagger`` set to None."""
        dct = self.__dict__.copy()
        dct['_tagger'] = None
        return dct
Exemplo n.º 4
0
class CRF(object):
    """
    CRF sequence model trained and applied through python-crfsuite.

    Parameters
    ----------
    algorithm : given to ``trainer_cls`` as ``algorithm`` when fitting.

    train_params : given to ``trainer_cls`` as ``params`` when fitting.

    verbose : bool
        Given to ``trainer_cls`` as ``verbose``; when true, ``fit`` also
        shows ``tqdm`` progress bars while loading data.

    model_filename : given to ``FileResource`` as ``filename``.

    keep_tempfiles : given to ``FileResource``.

    trainer_cls : (optional) callable
        Builds the trainer object; ``pycrfsuite.Trainer`` is used when
        this is None.

    Attributes
    ----------
    training_log_ :
        None until ``fit`` has run, then the ``logparser`` attribute of
        the trainer that was used.
    """

    def __init__(self,
                 algorithm=None,
                 train_params=None,
                 verbose=False,
                 model_filename=None,
                 keep_tempfiles=False,
                 trainer_cls=None):
        self.algorithm = algorithm
        self.train_params = train_params
        self.modelfile = FileResource(filename=model_filename,
                                      keep_tempfiles=keep_tempfiles,
                                      suffix=".crfsuite",
                                      prefix="model")
        self.verbose = verbose
        self._tagger = None
        if trainer_cls is None:
            self.trainer_cls = pycrfsuite.Trainer
        else:
            self.trainer_cls = trainer_cls
        self.training_log_ = None

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).

        y : list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.

        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If exactly one of ``X_dev`` and ``y_dev`` is passed, or if a
            feature list and its label list both have a length and the
            lengths differ.
        """
        dev_given = X_dev is not None
        if dev_given != (y_dev is not None):
            raise ValueError(
                "Pass both X_dev and y_dev to use the holdout data")

        # Pairing is done with zip(), which truncates to the shorter
        # input; fail loudly instead of training on partial data.
        self._require_equal_sizes("X", X, "y", y)
        if dev_given:
            self._require_equal_sizes("X_dev", X_dev, "y_dev", y_dev)

        # Close any open tagger before the model file is refreshed.
        if self._tagger is not None:
            self._tagger.close()
            self._tagger = None
        self.modelfile.refresh()

        trainer = self._get_trainer()
        self._load(trainer, zip(X, y), "loading training data to CRFsuite",
                   self._size_of(X))

        if dev_given:
            # The extra argument 1 matches the ``holdout`` value passed
            # to train() below.
            self._load(trainer, zip(X_dev, y_dev),
                       "loading dev data to CRFsuite", self._size_of(X_dev),
                       1)

        trainer.train(self.modelfile.name, holdout=1 if dev_given else -1)
        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Predict labels for several sequences.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists of strings
            predicted labels

        """
        return [self.predict_single(xseq) for xseq in X]

    def predict_single(self, xseq):
        """
        Predict labels for a single sequence.

        Parameters
        ----------
        xseq : list of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of strings
            predicted labels

        """
        return self.tagger.tag(xseq)

    def predict_marginals(self, X):
        """
        Predict marginal probabilities for several sequences.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists of dicts
            predicted probabilities for each label at each position

        """
        return [self.predict_marginals_single(xseq) for xseq in X]

    def predict_marginals_single(self, xseq):
        """
        Predict marginal probabilities for a single sequence.

        Parameters
        ----------
        xseq : list of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of dicts
            predicted probabilities for each label at each position

        """
        # One property lookup is enough; the tagger does not change here.
        tagger = self.tagger
        labels = tagger.labels()
        tagger.set(xseq)
        return [{label: tagger.marginal(label, i)
                 for label in labels} for i in range(len(xseq))]

    @property
    def tagger(self):
        """
        Return the cached tagger, opening the model file on first use.

        Raises
        ------
        Exception
            If ``self.modelfile.name`` is None.
        """
        if self._tagger is None:
            if self.modelfile.name is None:
                raise Exception("Can't load model. Is the model trained?")

            tagger = pycrfsuite.Tagger()
            tagger.open(self.modelfile.name)
            self._tagger = tagger
        return self._tagger

    def _get_trainer(self):
        """Instantiate ``trainer_cls`` with the stored settings."""
        return self.trainer_cls(
            algorithm=self.algorithm,
            params=self.train_params,
            verbose=self.verbose,
        )

    def _load(self, trainer, pairs, desc, total, *group):
        """Feed every (xseq, yseq) pair to ``trainer.append``, plus ``group`` if given."""
        if self.verbose:
            # ``total`` may be None when the input has no length.
            pairs = tqdm(pairs, desc, total, leave=True)

        for xseq, yseq in pairs:
            trainer.append(xseq, yseq, *group)

        if self.verbose:
            print("")

    @staticmethod
    def _size_of(seq):
        """Return ``len(seq)``, or None if ``seq`` does not support ``len``."""
        try:
            return len(seq)
        except TypeError:
            return None

    def _require_equal_sizes(self, x_name, X, y_name, y):
        """Raise ValueError when both inputs are sized and their lengths differ."""
        sizes = (self._size_of(X), self._size_of(y))
        if None not in sizes and sizes[0] != sizes[1]:
            raise ValueError(
                "%s and %s must have the same length (got %d and %d)" %
                (x_name, y_name, sizes[0], sizes[1]))

    def __getstate__(self):
        """Return a copy of the instance dict with ``_tagger`` replaced by None."""
        dct = self.__dict__.copy()
        dct['_tagger'] = None
        return dct