예제 #1
0
    def __init__(self,
                 model_filename=None,
                 train_args=None,
                 feature_template="# Label unigrams and bigrams:\n*\n",
                 unigrams_scope="u",
                 tempdir=None,
                 unlink_temp=True,
                 verbose=True,
                 feature_encoder=None,
                 dev_size=0):
        """
        Store the training and prediction configuration.

        Parameters
        ----------
        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        train_args : (optional) string, list or tuple
            Extra command-line arguments for wapiti training. A string is
            split with ``shlex.split``; a list or tuple is copied into a
            new list. When None, a default L-BFGS argument string is used.

        feature_template : string
            Feature template text, stored as ``self.feature_template``.

        unigrams_scope : (optional) string
            Stored as ``self.unigrams_scope``.

        tempdir : (optional) string
            Stored as ``self.tempdir``.

        unlink_temp : bool
            Stored as ``self.unlink_temp``; its negation is passed to
            ``FileResource`` as ``keep_tempfiles``.

        verbose : bool
            Stored as ``self.verbose``.

        feature_encoder : (optional) object
            Feature encoder; a new ``WapitiFeatureEncoder`` is created
            when a falsy value is passed.

        dev_size : int
            Stored as ``self.dev_size``.
        """
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=not unlink_temp,
            suffix='.wapiti',
            prefix='model',
        )

        if train_args is None:
            train_args = '--algo l-bfgs --maxiter 50 --compact --nthread 8 --jobsize 1 --stopwin 15'
        if isinstance(train_args, (list, tuple)):
            # Always keep a private list: a tuple can't be concatenated to
            # a list of arguments, and a caller-owned list must not be
            # aliased by this object.
            self.train_args = list(train_args)
        else:
            self.train_args = shlex.split(train_args)

        self.feature_template = feature_template
        self.unigrams_scope = unigrams_scope
        self.tempdir = tempdir
        self.unlink_temp = unlink_temp
        self.verbose = verbose
        self.dev_size = dev_size
        self._wapiti_model = None  # loaded lazily
        self.feature_encoder = feature_encoder or WapitiFeatureEncoder()
        super(WapitiCRF, self).__init__()
예제 #2
0
    def __init__(self, model_filename=None, train_args=None,
                 feature_template="# Label unigrams and bigrams:\n*\n",
                 unigrams_scope="u", tempdir=None, unlink_temp=True,
                 verbose=True, feature_encoder=None, dev_size=0):
        """
        Store the training and prediction configuration.

        Parameters
        ----------
        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        train_args : (optional) string, list or tuple
            Extra command-line arguments for wapiti training. A string is
            split with ``shlex.split``; a list or tuple is copied into a
            new list. When None, a default L-BFGS argument string is used.

        feature_template : string
            Feature template text, stored as ``self.feature_template``.

        unigrams_scope : (optional) string
            Stored as ``self.unigrams_scope``.

        tempdir : (optional) string
            Stored as ``self.tempdir``.

        unlink_temp : bool
            Stored as ``self.unlink_temp``; its negation is passed to
            ``FileResource`` as ``keep_tempfiles``.

        verbose : bool
            Stored as ``self.verbose``.

        feature_encoder : (optional) object
            Feature encoder; a new ``WapitiFeatureEncoder`` is created
            when a falsy value is passed.

        dev_size : int
            Stored as ``self.dev_size``.
        """
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=not unlink_temp,
            suffix='.wapiti',
            prefix='model',
        )

        if train_args is None:
            train_args = '--algo l-bfgs --maxiter 50 --compact --nthread 8 --jobsize 1 --stopwin 15'
        if isinstance(train_args, (list, tuple)):
            # Always keep a private list: a tuple can't be concatenated to
            # a list of arguments, and a caller-owned list must not be
            # aliased by this object.
            self.train_args = list(train_args)
        else:
            self.train_args = shlex.split(train_args)

        self.feature_template = feature_template
        self.unigrams_scope = unigrams_scope
        self.tempdir = tempdir
        self.unlink_temp = unlink_temp
        self.verbose = verbose
        self.dev_size = dev_size
        self._wapiti_model = None  # loaded lazily
        self.feature_encoder = feature_encoder or WapitiFeatureEncoder()
        super(WapitiCRF, self).__init__()
예제 #3
0
 def __init__(self, algorithm=None, train_params=None, verbose=False,
              model_filename=None, keep_tempfiles=False, trainer_cls=None):
     """
     Store trainer settings and create the model file resource.

     Parameters
     ----------
     algorithm : (optional)
         Stored as ``self.algorithm``.

     train_params : (optional)
         Stored as ``self.train_params``.

     verbose : bool
         Stored as ``self.verbose``.

     model_filename : (optional) string
         File name handed to ``FileResource`` for the model file.

     keep_tempfiles : bool
         Passed through to ``FileResource``.

     trainer_cls : (optional) class
         Trainer class to use; defaults to ``pycrfsuite.Trainer``.
     """
     self.algorithm = algorithm
     self.train_params = train_params
     self.modelfile = FileResource(
         filename=model_filename,
         keep_tempfiles=keep_tempfiles,
         suffix=".crfsuite",
         prefix="model",
     )
     self.verbose = verbose
     self._tagger = None
     if trainer_cls is None:
         # pycrfsuite is imported only when no trainer class is supplied.
         import pycrfsuite
         trainer_cls = pycrfsuite.Trainer
     self.trainer_cls = trainer_cls
     self.training_log_ = None
     super(CRFsuiteCRF, self).__init__()
예제 #4
0
 def __init__(self,
              algorithm=None,
              train_params=None,
              verbose=False,
              model_filename=None,
              keep_tempfiles=False,
              trainer_cls=None):
     """
     Store trainer settings and create the model file resource.

     Parameters
     ----------
     algorithm : (optional)
         Stored as ``self.algorithm``.

     train_params : (optional)
         Stored as ``self.train_params``.

     verbose : bool
         Stored as ``self.verbose``.

     model_filename : (optional) string
         File name handed to ``FileResource`` for the model file.

     keep_tempfiles : bool
         Passed through to ``FileResource``.

     trainer_cls : (optional) class
         Trainer class to use; defaults to ``pycrfsuite.Trainer``.
     """
     self.algorithm = algorithm
     self.train_params = train_params
     self.modelfile = FileResource(filename=model_filename,
                                   keep_tempfiles=keep_tempfiles,
                                   suffix=".crfsuite",
                                   prefix="model")
     self.verbose = verbose
     self._tagger = None
     if trainer_cls is None:
         # pycrfsuite is imported only when no trainer class is supplied.
         import pycrfsuite
         trainer_cls = pycrfsuite.Trainer
     self.trainer_cls = trainer_cls
     self.training_log_ = None
     super(CRFsuiteCRF, self).__init__()
예제 #5
0
class CRFsuiteCRF(BaseSequenceClassifier):
    """
    Sequence classifier that trains and applies CRF models through the
    ``pycrfsuite`` bindings.

    The trained model is written to the file managed by ``self.modelfile``;
    the tagger reading that file is opened lazily on first use.
    """

    def __init__(self, algorithm=None, train_params=None, verbose=False,
                 model_filename=None, keep_tempfiles=False, trainer_cls=None):
        """
        Store trainer settings and create the model file resource.

        Parameters
        ----------
        algorithm : (optional)
            Passed to the trainer class as ``algorithm``.

        train_params : (optional)
            Passed to the trainer class as ``params``.

        verbose : bool
            Passed to the trainer class as ``verbose``.

        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        keep_tempfiles : bool
            Passed through to ``FileResource``.

        trainer_cls : (optional) class
            Trainer class to use; defaults to ``pycrfsuite.Trainer``.
        """
        self.algorithm = algorithm
        self.train_params = train_params
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=keep_tempfiles,
            suffix=".crfsuite",
            prefix="model",
        )
        self.verbose = verbose
        self._tagger = None
        if trainer_cls is None:
            # pycrfsuite is imported only when no trainer class is supplied.
            import pycrfsuite
            trainer_cls = pycrfsuite.Trainer
        self.trainer_cls = trainer_cls
        self.training_log_ = None
        super(CRFsuiteCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model and store it in the model file.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).

        y : list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.

        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If exactly one of ``X_dev`` / ``y_dev`` is given.
        """
        # Holdout data is usable only when both halves are present.
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")
        has_holdout = X_dev is not None

        # Drop a previously opened tagger before the model file is refreshed.
        current = self._tagger
        if current is not None:
            current.close()
            self._tagger = None
        self.modelfile.refresh()

        trainer = self._get_trainer()
        for features, labels in zip(X, y):
            trainer.append(features, labels)

        if has_holdout:
            # Holdout sequences are appended with group 1.
            for features, labels in zip(X_dev, y_dev):
                trainer.append(features, labels, 1)

        holdout_group = 1 if has_holdout else -1
        trainer.train(self.modelfile.name, holdout=holdout_group)
        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Make a prediction.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists
            predicted labels, one list per input sequence

        """
        tagger = self.tagger
        return [tagger.tag(features) for features in X]

    @property
    def tagger(self):
        """
        ``pycrfsuite.Tagger`` opened on the model file.

        Created on first access and cached in ``self._tagger``. Raises
        ``Exception`` when the model file name is None.
        """
        if self._tagger is not None:
            return self._tagger

        if self.modelfile.name is None:
            raise Exception("Can't load model. Is the model trained?")

        import pycrfsuite
        opened = pycrfsuite.Tagger()
        opened.open(self.modelfile.name)
        self._tagger = opened
        return self._tagger

    def _get_trainer(self):
        """Instantiate ``self.trainer_cls`` with the stored settings."""
        options = dict(
            algorithm=self.algorithm,
            params=self.train_params,
            verbose=self.verbose,
        )
        return self.trainer_cls(**options)

    def __getstate__(self):
        """Return the instance state with the opened tagger left out."""
        return dict(self.__dict__, _tagger=None)
예제 #6
0
class WapitiCRF(BaseSequenceClassifier):
    """
    Class for training and applying Wapiti CRF models.

    For training it relies on calling the original Wapiti binary (via
    subprocess), so the "wapiti" binary must be available if you need
    the "fit" method.

    The trained model is saved in an external file; its filename is the
    first parameter of the constructor. This file is created and
    overwritten by :meth:`WapitiCRF.fit`; it must exist for
    :meth:`WapitiCRF.predict` to work.

    For prediction WapitiCRF relies on python-wapiti_ library.

    .. _python-wapiti: https://github.com/adsva/python-wapiti
    """

    WAPITI_CMD = 'wapiti'
    """ Command used to start wapiti """

    def __init__(self, model_filename=None, train_args=None,
                 feature_template="# Label unigrams and bigrams:\n*\n",
                 unigrams_scope="u", tempdir=None, unlink_temp=True,
                 verbose=True, feature_encoder=None, dev_size=0):
        """
        Store the training and prediction configuration.

        Parameters
        ----------
        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        train_args : (optional) string, list or tuple
            Extra command-line arguments for wapiti training. A string is
            split with ``shlex.split``; a list or tuple is copied into a
            new list. When None, a default L-BFGS argument string is used.

        feature_template : string
            Template text passed to the feature encoder's
            ``prepare_template`` when the template file is written.

        unigrams_scope : (optional) string
            When not None, the encoder's ``unigram_features_template`` for
            this scope is appended to the template file.

        tempdir : (optional) string
            Directory for the temporary data and template files.

        unlink_temp : bool
            If True, temporary files created by :meth:`fit` are removed
            afterwards; its negation is passed to ``FileResource`` as
            ``keep_tempfiles``.

        verbose : bool
            Passed to ``run_command`` when the wapiti binary is executed.

        feature_encoder : (optional) object
            Feature encoder; a new ``WapitiFeatureEncoder`` is created
            when a falsy value is passed.

        dev_size : int
            Number of leading training sequences used as development data
            when no development arguments are passed to :meth:`fit`.
        """
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=not unlink_temp,
            suffix='.wapiti',
            prefix='model',
        )

        if train_args is None:
            train_args = '--algo l-bfgs --maxiter 50 --compact --nthread 8 --jobsize 1 --stopwin 15'
        if isinstance(train_args, (list, tuple)):
            # Always keep a private list: a tuple can't be concatenated to
            # the argument list built in fit(), and a caller-owned list
            # must not be aliased by this object.
            self.train_args = list(train_args)
        else:
            self.train_args = shlex.split(train_args)

        self.feature_template = feature_template
        self.unigrams_scope = unigrams_scope
        self.tempdir = tempdir
        self.unlink_temp = unlink_temp
        self.verbose = verbose
        self.dev_size = dev_size
        self._wapiti_model = None  # loaded lazily by _get_python_wapiti_model
        self.feature_encoder = feature_encoder or WapitiFeatureEncoder()
        super(WapitiCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None, out_dev=None):
        """
        Train a model.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents.

        y : a list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of feature dicts
            Data used for testing and as a stopping criteria.

        y_dev : (optional) list of lists of labels
            Labels corresponding to X_dev.

        out_dev : (optional) string
            Path to a file where tagged development data will be written.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If any of ``X_dev``, ``y_dev``, ``out_dev`` is truthy while
            ``X_dev`` or ``y_dev`` is None.

        """
        self.modelfile.refresh()
        self._wapiti_model = None
        self.feature_encoder.reset()
        # The encoder is fitted on all of X, before any dev split below.
        self.feature_encoder.fit(X, y)

        if any([X_dev, y_dev, out_dev]):
            if X_dev is None or y_dev is None:
                raise ValueError("Pass both X_dev and y_dev to use the development data")
        elif self.dev_size > 0:
            # Use a part of training data to help with stopping.
            # It means less data is used for training.
            X_dev, y_dev = X[:self.dev_size], y[:self.dev_size]
            X, y = X[self.dev_size:], y[self.dev_size:]

        dev_fn = None
        to_unlink = []
        try:
            train_fn = self._create_wapiti_data_file(X, y)
            to_unlink.append(train_fn)

            if X_dev is not None:
                dev_fn = self._create_wapiti_data_file(X_dev, y_dev)
                # The development data file is temporary too; register it
                # so that it is removed together with the other files.
                to_unlink.append(dev_fn)
                if out_dev is None:
                    fd, out_dev = tempfile.mkstemp(
                        dir=self.tempdir,
                        suffix=".txt",
                        prefix="wapiti-dev-data",
                    )
                    to_unlink.append(out_dev)
                    # mkstemp returns an open descriptor; only the path is
                    # needed, so close it right away to avoid leaking it.
                    os.close(fd)

            template_fn = self._create_wapiti_feature_template_file()
            to_unlink.append(template_fn)

            # run wapiti training
            args = ['train', '--pattern', template_fn] + list(self.train_args)
            if dev_fn:
                args += ['--devel', dev_fn]
            args += [train_fn, self.modelfile.name]
            self.run_wapiti(args)

            # do a final check on development data
            if dev_fn:
                args = ['label', '-m', self.modelfile.name, '--check', dev_fn, out_dev]
                self.run_wapiti(args)

        finally:
            if self.unlink_temp:
                for filename in to_unlink:
                    os.unlink(filename)

        return self

    def predict(self, X):
        """
        Make a prediction.

        Parameters
        ----------
        X : list of lists
            feature dicts

        Returns
        -------
        y : list of lists
            predicted labels

        """
        model = self._get_python_wapiti_model()
        sequences = self._to_wapiti_sequences(X)
        return [model.label_sequence(seq).splitlines() for seq in sequences]

    def run_wapiti(self, args):
        """ Run ``wapiti`` binary in a subprocess """
        return run_command([self.WAPITI_CMD] + args, self.verbose)

    def _get_python_wapiti_model(self):
        """Return the python-wapiti model, loading it on first use."""
        if self._wapiti_model is None:
            self._load_model()
        return self._wapiti_model

    def _load_model(self):
        """
        Load the model file into ``self._wapiti_model``.

        Raises ValueError when the model file name is None.
        """
        import wapiti
        if self.modelfile.name is None:
            raise ValueError("model filename is unknown, can't load model")
        self._wapiti_model = wapiti.Model(model=self.modelfile.name)

    def _to_wapiti_sequences(self, X, y=None):
        """
        Encode X with the feature encoder and return one newline-joined
        string per sequence; when y is given, each line gets its tag
        appended.
        """
        X = self.feature_encoder.transform(X)
        if y is None:
            return ["\n".join(lines) for lines in X]
        else:
            return [
                self._to_train_sequence(lines, tags) for lines, tags in zip(X, y)
            ]

    def _create_wapiti_data_file(self, X, y=None):
        """
        Create a file with input data for wapiti. Return a resulting file name;
        caller should unlink the file.
        """
        with tempfile.NamedTemporaryFile('wb', prefix="wapiti-data-", suffix=".txt",
                                         dir=self.tempdir, delete=False) as fp:
            for seq in self._to_wapiti_sequences(X, y):
                fp.write(seq.encode('utf8'))
                # sequences are separated by an empty line
                fp.write(b"\n\n")
        return fp.name

    def _create_wapiti_feature_template_file(self):
        """
        Write the feature template (plus the unigram template when
        ``unigrams_scope`` is not None) to a temporary file and return its
        name; caller should unlink the file.
        """
        with tempfile.NamedTemporaryFile('wb', prefix="feature-template-", suffix=".txt",
                                         dir=self.tempdir, delete=False) as fp:
            template = self.feature_encoder.prepare_template(self.feature_template)
            fp.write(template.encode('utf8'))

            if self.unigrams_scope is not None:
                unigram_template = self.feature_encoder.unigram_features_template(self.unigrams_scope)
                fp.write(b"\n")
                fp.write(unigram_template.encode('utf8'))
        return fp.name

    def _to_train_sequence(self, wapiti_lines, tags):
        """Join lines and tags pairwise as "<line> <tag>", one pair per row."""
        return "\n".join(["%s %s" % (line, tag) for line, tag in zip(wapiti_lines, tags)])

    def __getstate__(self):
        """Return the instance state with the loaded wapiti model left out."""
        dct = self.__dict__.copy()
        dct['_wapiti_model'] = None
        return dct
예제 #7
0
class WapitiCRF(BaseSequenceClassifier):
    """
    Class for training and applying Wapiti CRF models.

    For training it relies on calling the original Wapiti binary (via
    subprocess), so the "wapiti" binary must be available if you need
    the "fit" method.

    The trained model is saved in an external file; its filename is the
    first parameter of the constructor. This file is created and
    overwritten by :meth:`WapitiCRF.fit`; it must exist for
    :meth:`WapitiCRF.predict` to work.

    For prediction WapitiCRF relies on python-wapiti_ library.

    .. _python-wapiti: https://github.com/adsva/python-wapiti
    """

    WAPITI_CMD = 'wapiti'
    """ Command used to start wapiti """

    def __init__(self,
                 model_filename=None,
                 train_args=None,
                 feature_template="# Label unigrams and bigrams:\n*\n",
                 unigrams_scope="u",
                 tempdir=None,
                 unlink_temp=True,
                 verbose=True,
                 feature_encoder=None,
                 dev_size=0):
        """
        Store the training and prediction configuration.

        Parameters
        ----------
        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        train_args : (optional) string, list or tuple
            Extra command-line arguments for wapiti training. A string is
            split with ``shlex.split``; a list or tuple is copied into a
            new list. When None, a default L-BFGS argument string is used.

        feature_template : string
            Template text passed to the feature encoder's
            ``prepare_template`` when the template file is written.

        unigrams_scope : (optional) string
            When not None, the encoder's ``unigram_features_template`` for
            this scope is appended to the template file.

        tempdir : (optional) string
            Directory for the temporary data and template files.

        unlink_temp : bool
            If True, temporary files created by :meth:`fit` are removed
            afterwards; its negation is passed to ``FileResource`` as
            ``keep_tempfiles``.

        verbose : bool
            Passed to ``run_command`` when the wapiti binary is executed.

        feature_encoder : (optional) object
            Feature encoder; a new ``WapitiFeatureEncoder`` is created
            when a falsy value is passed.

        dev_size : int
            Number of leading training sequences used as development data
            when no development arguments are passed to :meth:`fit`.
        """
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=not unlink_temp,
            suffix='.wapiti',
            prefix='model',
        )

        if train_args is None:
            train_args = '--algo l-bfgs --maxiter 50 --compact --nthread 8 --jobsize 1 --stopwin 15'
        if isinstance(train_args, (list, tuple)):
            # Always keep a private list: a tuple can't be concatenated to
            # the argument list built in fit(), and a caller-owned list
            # must not be aliased by this object.
            self.train_args = list(train_args)
        else:
            self.train_args = shlex.split(train_args)

        self.feature_template = feature_template
        self.unigrams_scope = unigrams_scope
        self.tempdir = tempdir
        self.unlink_temp = unlink_temp
        self.verbose = verbose
        self.dev_size = dev_size
        self._wapiti_model = None  # loaded lazily by _get_python_wapiti_model
        self.feature_encoder = feature_encoder or WapitiFeatureEncoder()
        super(WapitiCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None, out_dev=None):
        """
        Train a model.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents.

        y : a list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of feature dicts
            Data used for testing and as a stopping criteria.

        y_dev : (optional) list of lists of labels
            Labels corresponding to X_dev.

        out_dev : (optional) string
            Path to a file where tagged development data will be written.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If any of ``X_dev``, ``y_dev``, ``out_dev`` is truthy while
            ``X_dev`` or ``y_dev`` is None.

        """
        self.modelfile.refresh()
        self._wapiti_model = None
        self.feature_encoder.reset()
        # The encoder is fitted on all of X, before any dev split below.
        self.feature_encoder.fit(X, y)

        if any([X_dev, y_dev, out_dev]):
            if X_dev is None or y_dev is None:
                raise ValueError(
                    "Pass both X_dev and y_dev to use the development data")
        elif self.dev_size > 0:
            # Use a part of training data to help with stopping.
            # It means less data is used for training.
            X_dev, y_dev = X[:self.dev_size], y[:self.dev_size]
            X, y = X[self.dev_size:], y[self.dev_size:]

        dev_fn = None
        to_unlink = []
        try:
            train_fn = self._create_wapiti_data_file(X, y)
            to_unlink.append(train_fn)

            if X_dev is not None:
                dev_fn = self._create_wapiti_data_file(X_dev, y_dev)
                # The development data file is temporary too; register it
                # so that it is removed together with the other files.
                to_unlink.append(dev_fn)
                if out_dev is None:
                    fd, out_dev = tempfile.mkstemp(dir=self.tempdir,
                                                   suffix=".txt",
                                                   prefix="wapiti-dev-data")
                    to_unlink.append(out_dev)
                    # mkstemp returns an open descriptor; only the path is
                    # needed, so close it right away to avoid leaking it.
                    os.close(fd)

            template_fn = self._create_wapiti_feature_template_file()
            to_unlink.append(template_fn)

            # run wapiti training
            args = ['train', '--pattern', template_fn] + list(self.train_args)
            if dev_fn:
                args += ['--devel', dev_fn]
            args += [train_fn, self.modelfile.name]
            self.run_wapiti(args)

            # do a final check on development data
            if dev_fn:
                args = [
                    'label', '-m', self.modelfile.name, '--check', dev_fn,
                    out_dev
                ]
                self.run_wapiti(args)

        finally:
            if self.unlink_temp:
                for filename in to_unlink:
                    os.unlink(filename)

        return self

    def predict(self, X):
        """
        Make a prediction.

        Parameters
        ----------
        X : list of lists
            feature dicts

        Returns
        -------
        y : list of lists
            predicted labels

        """
        model = self._get_python_wapiti_model()
        sequences = self._to_wapiti_sequences(X)
        return [model.label_sequence(seq).splitlines() for seq in sequences]

    def run_wapiti(self, args):
        """ Run ``wapiti`` binary in a subprocess """
        return run_command([self.WAPITI_CMD] + args, self.verbose)

    def _get_python_wapiti_model(self):
        """Return the python-wapiti model, loading it on first use."""
        if self._wapiti_model is None:
            self._load_model()
        return self._wapiti_model

    def _load_model(self):
        """
        Load the model file into ``self._wapiti_model``.

        Raises ValueError when the model file name is None.
        """
        import wapiti
        if self.modelfile.name is None:
            raise ValueError("model filename is unknown, can't load model")
        self._wapiti_model = wapiti.Model(model=self.modelfile.name)

    def _to_wapiti_sequences(self, X, y=None):
        """
        Encode X with the feature encoder and return one newline-joined
        string per sequence; when y is given, each line gets its tag
        appended.
        """
        X = self.feature_encoder.transform(X)
        if y is None:
            return ["\n".join(lines) for lines in X]
        else:
            return [
                self._to_train_sequence(lines, tags)
                for lines, tags in zip(X, y)
            ]

    def _create_wapiti_data_file(self, X, y=None):
        """
        Create a file with input data for wapiti. Return a resulting file name;
        caller should unlink the file.
        """
        with tempfile.NamedTemporaryFile('wb',
                                         prefix="wapiti-data-",
                                         suffix=".txt",
                                         dir=self.tempdir,
                                         delete=False) as fp:
            for seq in self._to_wapiti_sequences(X, y):
                fp.write(seq.encode('utf8'))
                # sequences are separated by an empty line
                fp.write(b"\n\n")
        return fp.name

    def _create_wapiti_feature_template_file(self):
        """
        Write the feature template (plus the unigram template when
        ``unigrams_scope`` is not None) to a temporary file and return its
        name; caller should unlink the file.
        """
        with tempfile.NamedTemporaryFile('wb',
                                         prefix="feature-template-",
                                         suffix=".txt",
                                         dir=self.tempdir,
                                         delete=False) as fp:
            template = self.feature_encoder.prepare_template(
                self.feature_template)
            fp.write(template.encode('utf8'))

            if self.unigrams_scope is not None:
                unigram_template = self.feature_encoder.unigram_features_template(
                    self.unigrams_scope)
                fp.write(b"\n")
                fp.write(unigram_template.encode('utf8'))
        return fp.name

    def _to_train_sequence(self, wapiti_lines, tags):
        """Join lines and tags pairwise as "<line> <tag>", one pair per row."""
        return "\n".join(
            ["%s %s" % (line, tag) for line, tag in zip(wapiti_lines, tags)])

    def __getstate__(self):
        """Return the instance state with the loaded wapiti model left out."""
        dct = self.__dict__.copy()
        dct['_wapiti_model'] = None
        return dct
예제 #8
0
class CRFsuiteCRF(BaseSequenceClassifier):
    """
    Sequence classifier that trains and applies CRF models through the
    ``pycrfsuite`` bindings.

    The trained model is written to the file managed by ``self.modelfile``;
    the tagger reading that file is opened lazily on first use.
    """

    def __init__(self,
                 algorithm=None,
                 train_params=None,
                 verbose=False,
                 model_filename=None,
                 keep_tempfiles=False,
                 trainer_cls=None):
        """
        Store trainer settings and create the model file resource.

        Parameters
        ----------
        algorithm : (optional)
            Passed to the trainer class as ``algorithm``.

        train_params : (optional)
            Passed to the trainer class as ``params``.

        verbose : bool
            Passed to the trainer class as ``verbose``.

        model_filename : (optional) string
            File name handed to ``FileResource`` for the model file.

        keep_tempfiles : bool
            Passed through to ``FileResource``.

        trainer_cls : (optional) class
            Trainer class to use; defaults to ``pycrfsuite.Trainer``.
        """
        self.algorithm = algorithm
        self.train_params = train_params
        self.modelfile = FileResource(filename=model_filename,
                                      keep_tempfiles=keep_tempfiles,
                                      suffix=".crfsuite",
                                      prefix="model")
        self.verbose = verbose
        self._tagger = None
        if trainer_cls is None:
            # pycrfsuite is imported only when no trainer class is supplied.
            import pycrfsuite
            trainer_cls = pycrfsuite.Trainer
        self.trainer_cls = trainer_cls
        self.training_log_ = None
        super(CRFsuiteCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model and store it in the model file.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).

        y : list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.

        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If exactly one of ``X_dev`` / ``y_dev`` is given.
        """
        # Holdout data is usable only when both halves are present.
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")
        has_holdout = X_dev is not None

        # Drop a previously opened tagger before the model file is refreshed.
        current = self._tagger
        if current is not None:
            current.close()
            self._tagger = None
        self.modelfile.refresh()

        trainer = self._get_trainer()
        for features, labels in zip(X, y):
            trainer.append(features, labels)

        if has_holdout:
            # Holdout sequences are appended with group 1.
            for features, labels in zip(X_dev, y_dev):
                trainer.append(features, labels, 1)

        holdout_group = 1 if has_holdout else -1
        trainer.train(self.modelfile.name, holdout=holdout_group)
        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Make a prediction.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists
            predicted labels, one list per input sequence

        """
        tagger = self.tagger
        return [tagger.tag(features) for features in X]

    @property
    def tagger(self):
        """
        ``pycrfsuite.Tagger`` opened on the model file.

        Created on first access and cached in ``self._tagger``. Raises
        ``Exception`` when the model file name is None.
        """
        if self._tagger is not None:
            return self._tagger

        if self.modelfile.name is None:
            raise Exception("Can't load model. Is the model trained?")

        import pycrfsuite
        opened = pycrfsuite.Tagger()
        opened.open(self.modelfile.name)
        self._tagger = opened
        return self._tagger

    def _get_trainer(self):
        """Instantiate ``self.trainer_cls`` with the stored settings."""
        options = dict(
            algorithm=self.algorithm,
            params=self.train_params,
            verbose=self.verbose,
        )
        return self.trainer_cls(**options)

    def __getstate__(self):
        """Return the instance state with the opened tagger left out."""
        return dict(self.__dict__, _tagger=None)