def model_filename(tmpdir, xseq, yseq):
    from pycrfsuite import Trainer
    trainer = Trainer('lbfgs', verbose=False)
    trainer.append(xseq, yseq)
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)
    return model_filename

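The fixture above only returns the path of a freshly trained model. As a hedged sketch (not part of the original snippet), the file can be consumed with pycrfsuite's Tagger, assuming xseq is the same feature sequence used for training:

from pycrfsuite import Tagger

# Sketch only: open the model written by the fixture and tag the training sequence.
tagger = Tagger()
tagger.open(model_filename)   # path returned by the fixture above
labels = tagger.tag(xseq)     # one predicted label per item in xseq
tagger.close()
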
def test_trainer(tmpdir, xseq, yseq):
    trainer = Trainer('lbfgs')
    trainer.append(xseq, yseq)
    model_filename = str(tmpdir.join('model.crfsuite'))
    assert not os.path.isfile(model_filename)
    trainer.train(model_filename)
    assert os.path.isfile(model_filename)

def test_append_strstr_dicts(tmpdir):
    trainer = Trainer()
    trainer.append(
        [{'foo': 'bar'}, {'baz': False}, {'foo': 'bar', 'baz': True}, {'baz': 0.2}],
        ['spam', 'egg', 'spam', 'spam']
    )
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set(['foo:bar', 'baz'])
        assert info.state_features[('foo:bar', 'spam')] > 0

def train(self, training_data, classifier_path="classifier/cache/label_crf_classifier",
          c1=0, c2=10, period=300, minfreq=5):
    self.preprocess(training_data)
    train = Trainer()
    for i1, i in enumerate(self.x):
        train.append(ItemSequence(i), self.y[i1])
    params = {
        "c1": c1,
        "c2": c2,
        "period": period,
        "feature.minfreq": minfreq,
        "max_iterations": 1000
        # "calibration.eta": 0.05,
        # "calibration_samples": 400,
    }
    # train.select(algorithm="l2sgd")
    train.set_params(params)
    train.train(classifier_path)
    self.tagger = Tagger()
    self.tagger.open(classifier_path)

def _train_and_save(self, X_train, y_train): trainer = Trainer(verbose=False) for i, (xseq, yseq) in enumerate(zip(X_train, y_train)): # Check how much memory left, stop adding more data if too little if i % 2500 == 0: if (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory: print('EntityExtractorWorker:_get_memory_safe_features - Less than {} Mb of memory remaining, breaking adding more data.'.format(self.min_mb_available_memory)) self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(i) log_dict = { 'task': 'EntityExtractorWorker:_train_and_save', 'event': 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, i), 'data': {'task_id': self.task_id} } self.info_logger.info("Memory", extra=log_dict) break trainer.append(xseq, yseq) trainer.set_params({ 'c1': 0.5, # coefficient for L1 penalty 'c2': 1e-4, # coefficient for L2 penalty 'max_iterations': 50, # stop earlier # transitions that are possible, but not observed 'feature.possible_transitions': True}) output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type) # Train and save trainer.train(output_model_path) return trainer
def _train_and_save(self, X_train, y_train): trainer = Trainer(verbose=False) for i, (xseq, yseq) in enumerate(zip(X_train, y_train)): # Check how much memory left, stop adding more data if too little if i % 2500 == 0: if (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory: print('EntityExtractorWorker:_get_memory_safe_features - Less than {} Mb of memory remaining, breaking adding more data.'.format(self.min_mb_available_memory)) self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(i) logging.getLogger(INFO_LOGGER).info(json.dumps({ 'process': 'EntityExtractorWorker:_train_and_save', 'event': 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, i), 'data': {'task_id': self.task_id} })) break trainer.append(xseq, yseq) trainer.set_params({ 'c1': 0.5, # coefficient for L1 penalty 'c2': 1e-4, # coefficient for L2 penalty 'max_iterations': 50, # stop earlier # transitions that are possible, but not observed 'feature.possible_transitions': True}) output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type) # Train and save trainer.train(output_model_path) return trainer
def test_help_invalid_parameter():
    trainer = Trainer()
    trainer.select('l2sgd')

    # This segfaults without a workaround;
    # see https://github.com/chokkan/crfsuite/pull/21
    with pytest.raises(ValueError):
        trainer.help('foo')

    with pytest.raises(ValueError):
        trainer.help('c1')

def test_append_nested_dicts(tmpdir):
    trainer = Trainer()
    trainer.append(
        [
            {
                "foo": {
                    "bar": "baz",
                    "spam": 0.5,
                    "egg": ["x", "y"],
                    "ham": {"x": -0.5, "y": -0.1}
                },
            },
            {
                "foo": {
                    "bar": "ham",
                    "spam": -0.5,
                    "ham": set(["x", "y"])
                },
            },
        ],
        ['first', 'second']
    )
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        assert set(info.attributes.keys()) == set([
            'foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y',
            'foo:ham:x', 'foo:ham:y', 'foo:bar:ham',
        ])

        for feat in ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']:
            assert info.state_features[(feat, 'first')] > 0
            assert info.state_features.get((feat, 'second'), 0) <= 0

        for feat in ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']:
            assert info.state_features[(feat, 'second')] > 0
            assert info.state_features.get((feat, 'first'), 0) <= 0

def main(argv):
    inputDir = argv[0]
    testDir = argv[1]
    outputFPath = argv[2]

    trainData = list(get_data(inputDir))
    testData = list(get_data(testDir))
    random.shuffle(trainData)

    # create features
    trainFeatures = create_features(trainData)
    testFeatures = create_features(testData)

    trainer = Trainer()
    for dialogue in trainFeatures:
        trainer.append(dialogue[0], dialogue[1])

    trainer.set_params({
        'c1': 1.0,             # coefficient for L1 penalty
        'c2': 1e-3,            # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('./model.pkl')

    outputFile = open(outputFPath, 'w')
    tagger = Tagger()
    tagger.open('./model.pkl')

    totalUtter = correctUtter = 0
    for dialogue in testFeatures:
        preds = tagger.tag(dialogue[0])
        labels = dialogue[1]
        for i, pred in enumerate(preds):
            outputFile.write(pred + '\n')
            if len(labels) > 0:
                totalUtter += 1
                if labels[i] == pred:
                    correctUtter += 1
        outputFile.write('\n')

    if totalUtter > 0:
        accuracy = correctUtter / totalUtter
        print('Accuracy: ' + str(accuracy))
    outputFile.close()

def train(features: pd.Series, labels: pd.Series) -> None:
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()
    for idx in range(len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')

def train(features, labels):
    print("Training..")
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()
    for idx in range(0, len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')

def train(X_train, X_test, y_train, y_test, **kwargs):
    '''
    >>> corpus = CorpusReader('annot.opcorpora.xml')
    >>> X_train, x_test, y_train, y_test = get_train_data(corpus, test_size=0.33, random_state=42)
    >>> crf = train(X_train, X_test, y_train, y_test)
    '''
    crf = Trainer()
    crf.set_params({
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 200,
    })
    for xseq, yseq in zip(X_train, y_train):
        crf.append(xseq, yseq)

    crf.train(model_name)
    return crf

def test_tag_formats(tmpdir, xseq, yseq):
    # make all coefficients 1 and check that results are the same
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [dict((key, 1) for key in x) for x in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(xseq) == yseq

    # strings
    with Tagger().open(model_filename) as tagger:
        data = [x.keys() for x in xseq]
        assert tagger.tag(data) == yseq

def train(X_train, y_train, **kwargs):
    '''
    >>> corpus = CorpusReader('annot.opcorpora.xml')
    >>> X_train, x_test, y_train, y_test = get_train_data(corpus, test_size=0.33, random_state=42)
    >>> crf = train(X_train, y_train)
    '''
    crf = Trainer()
    crf.set_params({
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 200,
        'feature.possible_transitions': True,
    })
    for xseq, yseq in zip(X_train, y_train):
        crf.append(xseq, yseq)

    crf.train(PART_OF_SPEECH_MODEL_PATH)
    return crf

def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
    trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
    for doc in docs:
        for sentence in doc.sents:
            tokens = list(sentence)
            features = self.feature_extractor.extract(
                [str(token) for token in tokens])
            labels = self.encoder.encode(tokens)
            trainer.append(features, labels)
    trainer.train(path)
    self.tagger.close()
    self.tagger.open(path)

def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
        if do_train:
            self.trainer = Trainer()
        else:
            self.tagger = Tagger()
    else:
        if do_train:
            model = ChainCRF()
            self.trainer = FrankWolfeSSVM(model=model)
            self.feat_index = {}
            self.label_index = {}
        else:
            self.tagger = pickle.load(open(self.trained_model_name, "rb"))
            self.feat_index = pickle.load(open("ssvm_feat_index.pkl", "rb"))
            label_index = pickle.load(open("ssvm_label_index.pkl", "rb"))
            self.rev_label_index = {i: x for x, i in label_index.items()}

def train_model(train_samples, model_name):
    """Train a model, using all of the data for training.

    :param train_samples: [DataSample1, DataSample2, ...] training data
    :param model_name: file name used to save the trained model
    :return: None
    """
    train = Trainer()
    # append training samples into trainer
    for sample in train_samples:
        xseq = build_model_features(sample, 17, True)
        # yseq = sample.label
        yseq = sample.char_label
        train.append(xseq, yseq)
    train.train(model_name)

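A hedged usage sketch for train_model above; `load_samples` and the output file name are hypothetical placeholders, and the sample objects are assumed to expose the char_label attribute used in the function:

# Hypothetical call: `load_samples` stands in for whatever produces
# DataSample objects with a .char_label attribute.
samples = load_samples()
train_model(samples, 'char_crf.model')
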
def test_set_parameters_in_constructor():
    trainer = Trainer(params={'c2': 100})
    assert abs(trainer.get('c2') - 100) < 1e-6

def test_params_and_help():
    trainer = Trainer()

    trainer.select('lbfgs')
    assert 'c1' in trainer.params()
    assert 'c2' in trainer.params()
    assert 'num_memories' in trainer.params()
    assert 'L1' in trainer.help('c1')

    trainer.select('l2sgd')
    assert 'c2' in trainer.params()
    assert 'c1' not in trainer.params()
    assert 'L2' in trainer.help('c2')

def test_get_parameter():
    trainer = Trainer()
    trainer.select('l2sgd')
    assert abs(trainer.get('c2') - 0.1) > 1e-6
    trainer.set('c2', 0.1)
    assert abs(trainer.get('c2') - 0.1) < 1e-6

def test_trainer_select_raises_error():
    trainer = Trainer()
    with pytest.raises(ValueError):
        trainer.select('foo')

def test_algorithm_parameters(algo):
    trainer = Trainer(algo)
    params = trainer.get_params()
    assert params

    # set the same values
    trainer.set_params(params)
    params2 = trainer.get_params()
    assert params2 == params

    # change a value
    trainer.set('feature.possible_states', True)
    assert trainer.get_params()['feature.possible_states'] == True

    trainer.set('feature.possible_states', False)
    assert trainer.get_params()['feature.possible_states'] == False

    # invalid parameter
    params['foo'] = 5
    with pytest.raises(ValueError):
        trainer.set_params(params)

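test_algorithm_parameters expects an `algo` argument supplied by pytest. One way to provide it is a parametrized fixture over the standard CRFsuite algorithm names; this is a sketch, not necessarily how the surrounding test suite wires it up:

import pytest

@pytest.fixture(params=['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'])
def algo(request):
    # Each test that takes `algo` runs once per training algorithm.
    return request.param
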
def test_trainer_noselect_noappend(tmpdir):
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    trainer = Trainer()
    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

def train(self, data=None, form_col=None, lbl_col=None, ilbl_col=None,
          data_cols=None, data_sep=None, dump=True):
    """Trains a model based on provided data and features.

    The default behaviour is to load training parameters from the global
    configuration, unless they are passed to this method.

    IMPORTANT: there are two ways to pass data directly through the `data`
    parameter:

    -- np.recarray
       `data` needs to be a recarray with column names that match what the
       feature extractor expects.

    -- csv str
       `data` needs to contain a TSV/CSV formatted string. Column names and
       separator should be provided in the `data_cols` and `data_sep`
       parameters. They should still match what is expected by the feature
       extractor.

    The observation, label, and inference column names can be set through
    the global configuration using the following parameter names:
    `form_col`, `label_col`, `guess_label_col`. The default observation
    column name is `fc`, and the inference column name is `guesstag`. All
    three names can be passed to this method to override the global
    configuration. Any other column names need to match their respective
    feature extractor functions, e.g. part-of-speech tags need to be placed
    in the `postag` column. See `ftex.FeatureTemplate` for others.

    RECOMMENDED: use `utils.parse_tsv` to parse input data to avoid column
    configuration errors.

    NOTE: Due to the way `pycrfsuite` works, the crfsuite model needs to be
    dumped on the hard drive; however, the CRFSuiteTagger model does not
    NEED to be dumped. That process is controlled through the `dump`
    parameter.

    :param data: training data
    :type data: np.recarray or str
    :param form_col: fc column name
    :type form_col: str
    :param lbl_col: label column name
    :type lbl_col: str
    :param ilbl_col: inference label column name
    :type ilbl_col: str
    :param data_cols: list of columns in the data
    :type data_cols: str
    :param data_sep: data tab separator
    :type data_sep: str
    :param dump: dumps the model at the specified location if True
    :type dump: bool
    """
    # overriding parameters
    fc = form_col if form_col else self.form_col
    c = data_cols if data_cols else self.cols
    sep = data_sep if data_sep else self.ts
    lc = lbl_col if lbl_col else self.lbl_col
    ilc = ilbl_col if ilbl_col else self.ilbl_col

    if type(data) in [np.core.records.recarray, np.ndarray]:
        d = data
    elif type(data) == str:
        d = parse_tsv(s=data, cols=c, ts=sep, inference_col=ilc)
    elif data is None:
        d = self.train_data
    else:
        raise ValueError('Invalid input type.')

    # extract features
    X = self._extract_features(d, fc)

    # extract labels
    y = gsequences(d, [lc])

    trainer = Trainer(verbose=self.verbose)

    # setting CRFSuite parameters
    trainer.set_params(self.cfg_crf)

    for x_seq, y_seq in zip(X, y):
        trainer.append(x_seq, [l[0] for l in y_seq])

    crfs_mp = '%s.crfs' % self.model_path
    try:
        makedirs(dirname(crfs_mp))
    except OSError:
        pass
    trainer.train(crfs_mp)

    self.tagger = Tagger()
    self.tagger.open(crfs_mp)

    # dumps the model
    if dump:
        self.dump_model(self.model_path)
        pickle.dump(self.cfg, open('%s.cfg.pcl' % self.model_path, 'w'))
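
A minimal usage sketch for the train method above, assuming `tagger_model` is an already-configured CRFSuiteTagger-style object whose global configuration points at its training data; `my_recarray` is a hypothetical np.recarray with matching column names:

# Sketch only: with data=None the method falls back to self.train_data.
tagger_model.train()
# Alternatively, pass a recarray built elsewhere; its columns must match what
# the feature extractor expects (e.g. an 'fc' observation column).
tagger_model.train(data=my_recarray, dump=False)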