def __init__(self, data):
    """Set up the CRF tagger, its reformulator and the empty training buffers.

    :param data: project data object; ``label_alphabet`` and
        ``word_alphabet`` are read from it and the object itself is passed
        to ``Reformulator``.
    """
    # A single-argument print() call produces the same output on Python 2
    # and Python 3, unlike the Python-2-only print statement.
    print("build batched lstmcrf...")
    self.label_alphabet = data.label_alphabet
    self.word_alphabet = data.word_alphabet
    self.crf = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=50,
        all_possible_transitions=True
    )
    self.reformulator = Reformulator(data)
    self.useReformulator = False
    self.loss_function = nn.NLLLoss()
    self.topk = 50
    # Buffers start out empty; they are not filled in this constructor.
    self.X_train = []
    self.Y_train = []
    self.tag_mask_list = []
    self.instances = []
    self.scores_refs = []
    self.tag_mask = None
def train(file_path: str):
    """
    Train a CRF model from the JSON-lines file at ``file_path``.

    The records are split 75/25 (fixed ``random_state``) into a training and
    a validation part. The model is fitted on the training part and the
    weighted flat F1 score over every label in ``LABELS`` except ``'O'`` is
    printed for the validation part.

    :param file_path: path to the JSON-lines file with the address records
    :return: the fitted ``CRF`` model
    """
    with jsonlines.open(file_path) as reader:
        addresses = list(reader)

    addresses_train, addresses_val = train_test_split(
        addresses, test_size=0.25, random_state=42)
    X_train, y_train = addresses_to_features(addresses_train)
    X_val, y_val = addresses_to_features(addresses_val)

    crf = CRF(c1=0.2, c2=0.2, max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # prediction score on validation set; previously computed and thrown
    # away, now reported so the evaluation is actually visible
    y_pred = crf.predict(X_val)
    f1_score = metrics.flat_f1_score(
        y_val, y_pred, average='weighted',
        labels=[l for l in LABELS if l != 'O'])
    print("Flat F1-Score on validation set = {}".format(f1_score))
    return crf
class CRFModel(object):
    """Wrapper around ``CRF`` that featurizes sentences before fit/predict."""

    def __init__(self, algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,
                 all_possible_transitions=False):
        """Create the underlying ``CRF`` with the given hyper-parameters."""
        self.model = CRF(
            algorithm=algorithm,
            c1=c1,
            c2=c2,
            max_iterations=max_iterations,
            all_possible_transitions=all_possible_transitions,
        )

    def train(self, sentences, tag_lists, tagged=False):
        """Fit the model on ``sentences`` with the labels in ``tag_lists``.

        ``tagged`` selects ``sent2features_tagged`` instead of
        ``sent2features`` as the feature extractor.
        """
        extractor = sent2features_tagged if tagged else sent2features
        self.model.fit([extractor(s) for s in sentences], tag_lists)

    def test(self, sentences, tagged=False):
        """Return the predicted tag lists for ``sentences``.

        ``tagged`` selects the feature extractor exactly as in ``train``.
        """
        extractor = sent2features_tagged if tagged else sent2features
        return self.model.predict([extractor(s) for s in sentences])
def parameter_tuning(args, dataset):
    """Grid-search ``c1``/``c2`` and return the model with the best validation F1.

    Candidate values come from ``experiment_util.get_param_list`` applied to
    ``args.c1`` and ``args.c2``. Every combination is fitted on
    ``dataset.training`` and scored with the micro-averaged flat F1 on
    ``dataset.validation``. The best score and its parameters are printed.

    :return: the best fitted ``CRF``, or ``None`` if no combination scored
        above negative infinity (e.g. an empty grid).
    """
    c1s = experiment_util.get_param_list(args.c1)
    c2s = experiment_util.get_param_list(args.c2)

    # (score, c1, c2, model) of the best combination seen so far
    best = (-np.inf, -np.inf, -np.inf, None)
    for c1 in c1s:
        for c2 in c2s:
            candidate = CRF(algorithm='lbfgs', c1=c1, c2=c2,
                            max_iterations=500,
                            all_possible_transitions=True,
                            verbose=args.debug)
            candidate.fit(dataset.training.list_of_feature_dicts,
                          dataset.training.list_of_labels)
            preds = candidate.predict(
                dataset.validation.list_of_feature_dicts)
            score = metrics.flat_f1_score(
                dataset.validation.list_of_labels, preds, average='micro')
            # strict comparison: the first of several equal scores wins
            if score > best[0]:
                best = (score, c1, c2, candidate)

    best_valid_f1_score, best_c1, best_c2, best_model = best
    print('Best validation F1 score:', best_valid_f1_score,
          'Best c1:', best_c1, 'Best c2:', best_c2)
    return best_model
def fit(self, train_data: Iterable[str], labels: Iterable[Iterable[str]]):
    """
    Build the CRF data set from the inputs and fit a fresh CRF model on it.

    :param train_data: training inputs, passed to ``__create_dataset``
    :param labels: labels in BIO or BILOU notation
    :return: ``self``
    """
    crf_dataset = self.__create_dataset(train_data, labels)

    # Two separate passes over the data set: features first, then labels.
    features = list(map(self.__convert_idata_to_features, crf_dataset))
    targets = list(map(self.__extract_labels_from_data, crf_dataset))

    model = CRF(
        algorithm='lbfgs',
        c1=self.__CONFIG['L1_c'],
        c2=self.__CONFIG['L2_c'],
        max_iterations=self.__CONFIG['max_iterations'],
        all_possible_transitions=True,
    )
    self.__crf_model = model
    self.__crf_model.fit(features, targets)
    return self
def test_crf(train_file,test_file,model_name=""):
    """Train a CRF on ``train_file`` and return its predictions for ``test_file``.

    Both files are read as header-less, tab-separated tables. Columns 0-3 of
    the training file and columns 0-2 of the test file are converted with
    ``fromListToTuple`` and featurised with ``prepareData``.

    :param train_file: path of the training table
    :param test_file: path of the table to predict on
    :param model_name: when non-empty, the fitted model is stored through
        ``save_model`` under ``model_name + ".pickle"``
    :return: the predicted labels of the first sequence, each as ``str``
    """
    useful_features = [True, True]

    train_table = pandas.read_csv(train_file, sep="\t", header=None)
    X_dataset = fromListToTuple(train_table.iloc[:, [0, 1, 2, 3]].values)
    X_train, y_train = prepareData([X_dataset], 'train', useful_features)

    # The test data is loaded once, before the (slow) training, so a bad
    # test file still fails early. It used to be read and featurised twice.
    test_table = pandas.read_csv(test_file, sep="\t", header=None)
    X_test = fromListToTuple(test_table.iloc[:, [0, 1, 2]].values)
    X_teste, _ = prepareData([X_test], 'predict', useful_features)

    crf = CRF(
        algorithm='lbfgs',
        c1=0.0625,
        c2=0.5,
        max_iterations=100,
        all_possible_transitions=False,
        all_possible_states=True,
        verbose=True
    )
    crf.fit(X_train, y_train)
    if model_name != "":
        save_model(model_name + ".pickle", crf)

    y_pred = crf.predict(X_teste)
    return [str(label) for label in y_pred[0]]
def train(train_file, test_file, min_freq, model_file):
    '''Train a CRF tagger and optionally evaluate and save it.

    :param train_file: CoNLL file with the training data; column 0 of each
        line is used as the token and column 2 as the label
    :param test_file: CoNLL file to evaluate on; skipped when falsy
    :param min_freq: passed to ``CRF`` as ``min_freq``
    :param model_file: path the model is dumped to with joblib; skipped
        when falsy
    '''
    # Read in initial training data
    conll_data_train = read_conll_data(train_file)
    train_sents = [[line[0] for line in doc] for doc in conll_data_train]
    train_labels = [[line[2] for line in doc] for doc in conll_data_train]

    # Featurize and create instance from list of sentences
    feat_sent_train = build_dataset(train_sents)
    print("Training on {0} inst".format(len(feat_sent_train)))

    # Create and train CRF model
    # For different parameter options, see:
    # https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html
    model = CRF(min_freq=min_freq)
    model.fit(feat_sent_train, train_labels)

    # Test the model on held out test set if wanted. The decision is based
    # on this function's own parameter rather than on a global ``args``.
    if test_file:
        conll_data_test = read_conll_data(test_file)
        test_sents = [[line[0] for line in doc] for doc in conll_data_test]
        test_labels = [[line[2] for line in doc] for doc in conll_data_test]
        feat_sent_test = build_dataset(test_sents)
        # Predicting and printing accuracy
        pred = model.predict(feat_sent_test)
        acc = metrics.flat_accuracy_score(test_labels, pred)
        print("Accuracy: {0}%".format(float(round(acc, 3)) * 100))

    # Save model to disk if wanted (again driven by the parameter)
    if model_file:
        print("Saving model to {0}".format(model_file))
        joblib.dump(model, model_file)
def test_crf(xseq, yseq, algorithm):
    """Fit on a single sequence and check the model reproduces its labels."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    predicted = model.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
def __init__(self, data):
    """Set up the CRF tagger, its examiner and the empty training buffers.

    :param data: project data object; ``label_alphabet``, ``word_alphabet``
        and ``label_alphabet_size`` are read from it and the object itself
        is passed to ``Examiner``.
    """
    print("build batched lstmcrf...")
    self.label_alphabet = data.label_alphabet
    self.word_alphabet = data.word_alphabet
    crf_settings = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_states=False,
        all_possible_transitions=True,
    )
    self.crf = CRF(**crf_settings)
    self.examiner = Examiner(data)
    self.useExaminer = False
    self.loss_function = nn.NLLLoss()
    self.topk = 5
    # Buffers start out empty; they are not filled in this constructor.
    self.X_train, self.Y_train = [], []
    self.pos_mask_list = []
    self.instances = []
    self.scores_refs = []
    self.pos_mask = None
    self.tag_size = data.label_alphabet_size
class CRFNER(object):
    """CRF-based named-entity recogniser built on the project's NER helpers."""

    def __init__(self, gazetteer, fraction=0.7):
        """Store the gazetteer and ``fraction`` (not read elsewhere in this class)."""
        self.gazateer = gazetteer
        self.fraction = fraction

    def train(self, documents):
        """Format ``documents``, split them, extract features and fit the CRF."""
        self.data = ner_processing.NERFormatter(self.gazateer, documents)
        d_train, d_test = ner_processing.train_test_NER(self.data)
        extracted = crf_processing.feature_extraction(d_train, d_test)
        self.X_train, self.X_test, self.y_train, self.y_test = extracted
        self.model = CRF(
            algorithm='lbfgs',
            c1=0.31,
            c2=0.02,
            max_iterations=100,
            all_possible_transitions=True,
        )
        self.model.fit(self.X_train, self.y_train)

    def predict(self, sentence):
        """Transforms a single sentence (for NER testing) into a CRF-suite format
        and returns the model's prediction for it."""
        tokens = nltk.word_tokenize(sentence)
        zeros = [0] * len(tokens)
        pos_tags = [pair[-1] for pair in nltk.pos_tag(tokens)]
        df_pred = pd.DataFrame({
            'word': tokens,
            'sentence_no': zeros,
            'category': zeros,
            'POS': pos_tags,
        })
        getter = crf_processing.SentenceGetter(df_pred)
        # The return value is not used; the call is kept as in the original.
        getter.get_next()
        self.X = [crf_processing.sent2features(s) for s in getter.sentences]
        return self.model.predict(self.X)

    def report(self):
        """Print the weighted flat F1 score and a per-label report on the test split."""
        labels = list(self.model.classes_)
        y_pred = self.model.predict(self.X_test)
        weighted_f1 = metrics.flat_f1_score(
            self.y_test, y_pred, average='weighted', labels=labels)
        print('F1 score {}'.format(weighted_f1))
        # Sort on everything after the first character, then on that character.
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        report_text = metrics.flat_classification_report(
            self.y_test, y_pred, labels=sorted_labels, digits=3)
        print(report_text)
def train(file_path: str, model_path: str = None):
    """
    Train a CRF model from ``file_path`` and optionally store it.

    The records returned by ``read_file`` are split 75/25 with a fixed
    ``random_state``. The weighted flat F1 score over every label in
    ``LABELS`` except ``"O"`` is printed for the validation part. When
    ``model_path`` is truthy the model is dumped there with joblib.

    :return: the fitted ``CRF`` model
    """
    records = read_file(file_path)
    train_part, val_part = train_test_split(
        records, test_size=0.25, random_state=42)
    X_train, y_train = addresses_to_features(train_part)
    X_val, y_val = addresses_to_features(val_part)

    crf = CRF(c1=0.2, c2=0.2, max_iterations=100,
              all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # prediction score on validation set
    scored_labels = [l for l in LABELS if l != "O"]
    f1_score = metrics.flat_f1_score(
        y_val, crf.predict(X_val), average="weighted", labels=scored_labels)
    print("Flat F1-Score on validation set = {}".format(f1_score))

    if model_path:
        joblib.dump(crf, model_path)
        print("Save model to {}".format(model_path))
    return crf
def test_attributes(xseq, yseq):
    """Learned attributes are ``None`` before fitting and populated afterwards."""
    crf = CRF()
    unfitted_names = (
        "tagger_",
        "size_",
        "classes_",
        "num_attributes_",
        "attributes_",
        "state_features_",
        "transition_features_",
    )
    for attr_name in unfitted_names:
        assert getattr(crf, attr_name) is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {"sunny", "rainy"}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert "clean" in crf.attributes_

    state_features = crf.state_features_
    assert len(state_features) > 0
    assert all(isinstance(weight, float) for weight in state_features.values())
    assert all(
        attr in crf.attributes_ and label in crf.classes_
        for (attr, label) in state_features.keys()
    ), crf.state_features_

    transition_features = crf.transition_features_
    assert len(transition_features) > 0
    assert all(isinstance(weight, float)
               for weight in transition_features.values())
    assert all(
        label_from in crf.classes_ and label_to in crf.classes_
        for (label_from, label_to) in transition_features.keys()
    ), crf.transition_features_
def main():
    """Train a CRF tagger from the CSV at ``args.input`` and pickle it to ``args.output``.

    The tagged sentences are split 95/5 (fixed ``random_state``). The model
    is fitted on the training part only, saved, and then handed to
    ``TestData`` together with both parts for evaluation.
    """
    df = pd.read_csv(args.input)
    tagged_sentence = Preparing_tagged_data(df)

    # printing details
    printing_details(tagged_sentence)

    train_set, test_set = train_test_split(
        tagged_sentence, test_size=0.05, random_state=7)

    # Features come from the training split only. Fitting on the complete
    # data set (as before) put the test sentences into the training data
    # and made the evaluation on X_test meaningless.
    X_train, y_train = prepareData(train_set)
    X_test, y_test = prepareData(test_set)

    crf = CRF(
        algorithm='l2sgd',
        c2=0.1,
        max_iterations=1000,
        all_possible_transitions=True)
    crf.fit(X_train, y_train)
    print(crf)

    print("Saving Model .....")
    # Save the Model to file in the current working directory
    Pkl_Filename = args.output
    with open(Pkl_Filename, 'wb') as model_file:
        pickle.dump(crf, model_file)
    print("Model Saved at "+ Pkl_Filename)
    print()
    print("Checking the Algoritham's Performance \n")
    TestData(crf, X_train, y_train, X_test, y_test)
def train_pos_tagger(self, path):
    """Train a CRF POS tagger on the NLTK treebank sample and pickle it.

    The first 80% of the treebank sentences are used for training. The
    fitted model is written to ``path`` and stored on ``self.classifier``.

    :param path: file the pickled model is written to
    """
    # Just to make sure
    nltk.download('treebank')
    tagged_sentences = treebank.tagged_sents()
    train_size = int(.80 * len(tagged_sentences))
    training_sentences = tagged_sentences[:train_size]
    X_train, y_train = self.transform_to_dataset(training_sentences)

    model = CRF()
    print('Training started...')
    model.fit(X_train, y_train)
    print('Training finished.')

    # Save classifier to file; the context manager closes the handle even
    # if pickling fails.
    with open(path, 'wb') as model_pkl:
        pickle.dump(model, model_pkl)
    print("POSTagger saved.")
    self.classifier = model
def write_to_CoNLL(mdl_file_name, sentence2features, test_sentences, write_path):
    """Predict labels for ``test_sentences`` and append them to ``write_path``.

    A ``CRF`` is created with ``model_filename=mdl_file_name``. When the
    name ends in ``'2'``, a second ``CRF`` is created from the same name
    with the trailing character replaced by ``'1'`` and features come from
    ``sent2features_second_guess``; otherwise ``sentence2features`` is
    applied directly. Output is one ``token<TAB>label`` line per token and
    an empty line after every sentence.
    """
    cond_rand_mdl = CRF(algorithm='lbfgs', c1=0.0001, c2=0.0001,
                        max_iterations=100, all_possible_transitions=False,
                        model_filename=mdl_file_name)

    if mdl_file_name[-1] == '2':
        first_stage_name = mdl_file_name[:-1] + '1'
        old_crf = CRF(algorithm='lbfgs', c1=0.0001, c2=0.0001,
                      max_iterations=100, all_possible_transitions=False,
                      model_filename=first_stage_name)
        X_test_local = [
            sent2features_second_guess(s, sentence2features, old_crf)
            for s in test_sentences
        ]
    else:
        X_test_local = [sentence2features(s) for s in test_sentences]

    predictions = cond_rand_mdl.predict(X_test_local)

    with open(write_path, 'a') as f:
        for i, preds in enumerate(predictions):
            sent = test_sentences[i]
            for j, token in enumerate(sent):
                f.write('{}\t{}\n'.format(token[0], preds[j]))
            # blank line separates sentences
            f.write('\n')
def entity_crf_train(my_subjects):
    """Fill subject placeholders in the training data, fit a CRF and pickle it.

    ``X`` and ``y`` are not defined in this function; they are read from an
    enclosing scope and their elements are replaced in place.

    :param my_subjects: sequence of subject strings; one is drawn at random
        for every placeholder and it is also pickled next to the model
    """
    for i in range(0, len(X)):
        # NOTE(review): this range is fixed from the length of X[i] before
        # any replacement below changes that length - confirm the indices
        # stay valid for multi-word or empty subjects.
        for j in range(0, len(X[i])):
            # any element containing 'sub' is treated as a placeholder
            if 'sub' in X[i][j]:
                subj = my_subjects[np.random.randint(len(my_subjects))]
                subj = subj.split()
                # splice the subject's words in, one 'subject' label per word
                X[i] = X[i][:j] + subj + X[i][j + 1:]
                y[i] = y[i][:j] + ['subject'] * len(subj) + y[i][j + 1:]
        # keep at most the first 10 items of every sequence
        # NOTE(review): nesting reconstructed from a flattened source; this
        # is assumed to run once per sequence, after the scan - confirm.
        X[i] = X[i][0:10]
        y[i] = y[i][0:10]
    crf = CRF(c1=0.1, c2=0.01, max_iterations=200, all_possible_transitions=True)
    print(".....Training entity extraction model.....")
    crf.fit(X, y)
    print(".....Trained entity extraction model.....")
    # output goes to <parent of the current working directory>/full_model/
    working_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    with open(working_directory + '/full_model/crf_model.pkl', 'wb') as pickle_file:
        pickle.dump(crf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(working_directory + '/full_model/subjects.pkl', 'wb') as pickle_file:
        pickle.dump(my_subjects, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
def test_attributes(xseq, yseq):
    """Check the estimator's learned attributes before and after ``fit``."""
    crf = CRF()

    # Nothing is learned yet, so every trailing-underscore attribute is None.
    before_fit = [
        crf.tagger_,
        crf.size_,
        crf.classes_,
        crf.num_attributes_,
        crf.attributes_,
        crf.state_features_,
        crf.transition_features_,
    ]
    for value in before_fit:
        assert value is None

    crf.fit([xseq] * 20, [yseq] * 20)

    assert crf.tagger_ is not None
    assert crf.size_ > 1000
    assert set(crf.classes_) == {'sunny', 'rainy'}

    assert crf.num_attributes_ > 0
    assert len(crf.attributes_) == crf.num_attributes_
    assert all(crf.attributes_)
    assert 'clean' in crf.attributes_

    assert len(crf.state_features_) > 0
    for coef in crf.state_features_.values():
        assert isinstance(coef, float)
    assert all(
        attr in crf.attributes_ and label in crf.classes_
        for (attr, label) in crf.state_features_.keys()
    ), crf.state_features_

    assert len(crf.transition_features_) > 0
    for coef in crf.transition_features_.values():
        assert isinstance(coef, float)
    assert all(
        label_from in crf.classes_ and label_to in crf.classes_
        for (label_from, label_to) in crf.transition_features_.keys()
    ), crf.transition_features_
class CRFBased:
    '''CRF based information retrieval. The model is similar to the Default
    model used in homework 2 and 3'''

    def __init__(self, load, n_train, n_test):
        """Store the data source and sample counts and create the CRF.

        :param load: passed unchanged to ``prepare_crf_dataset``
        :param n_train: number of training samples (converted with ``int``)
        :param n_test: number of test samples (converted with ``int``)
        """
        self.load = load
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=False)
        self.n_train = int(n_train)
        self.n_test = int(n_test)

    def load_data(self):
        """Return ``(X_train, y_train, X_test, y_test)`` from ``prepare_crf_dataset``."""
        X_train, y_train, X_test, y_test = prepare_crf_dataset(
            self.load, self.n_train, self.n_test)
        return X_train, y_train, X_test, y_test

    def fit(self, X, y):
        """Fit the CRF and remember every learned label except ``'O'``."""
        self.crf.fit(X, y)
        # Filtering instead of list.remove() keeps this working when the
        # training data contains no 'O' label at all (remove() would raise
        # ValueError in that case).
        self.labels = [label for label in self.crf.classes_ if label != 'O']

    def predict(self, X):
        """Return the CRF's predictions for ``X`` (train or test features)."""
        pred = self.crf.predict(X)
        return pred

    def evaluate(self, y_true, y_pred):
        """Print a flat classification report over the labels stored by ``fit``."""
        print("Final Scores for CRF Based Modes:")
        print(
            flat_classification_report(y_pred=y_pred,
                                       y_true=y_true,
                                       labels=self.labels))
def build(sequences, labels, **kwargs):
    """
    Train a sequence classifier from feature sequences and their labels.

    :param sequences: A list of sequences, each member of a sequence being
        represented as a feature dict
    :type sequences: list of list of dict
    :param labels: The label sequence belonging to each input sequence
    :type labels: list of list of str
    :param kwargs: overrides for the module's default CRF arguments
    :return: A classifier wrapping the CRF fitted on the given data
    :rtype: SequenceClassifier
    """
    params = dict([
        ('algorithm', DEFAULT_ALGORITHM),
        ('c1', DEFAULT_C1),
        ('c2', DEFAULT_C2),
        ('max_iterations', DEFAULT_MAX_ITERATIONS),
        ('all_possible_transitions', DEFAULT_ALL_POSSIBLE_TRANSITIONS),
    ])
    # Caller-supplied values win over the defaults; no-op when empty.
    params.update(kwargs)

    crf = CRF(**params)
    crf.fit(sequences, labels)
    return SequenceClassifier(crf)
def test_sklearn_crfsuite(xseq, yseq):
    """Explain a fitted CRF and check the text, HTML and dataframe renderings."""
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])
    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    expected_in_text = (
        "y='sunny' top features",
        "y='rainy' top features",
        "Transition features",
        "sunny -0.130 0.696",
        u'+0.124 солнце:не светит',
    )
    for fragment in expected_in_text:
        assert fragment in text

    assert u'солнце:не светит' in html
    # whitespace-free copy makes the table-header check layout independent
    compact_html = html.replace(' ', '').replace("\n", '')
    assert '<th>rainy</th><th>sunny</th>' in compact_html

    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        # dataframe formatters are unavailable; nothing more to check
        return

    from .test_formatters_as_dataframe import check_targets_dataframe
    frames = format_as_dataframes(expl)
    check_targets_dataframe(frames['targets'], expl)
    df_transition = frames['transition_features']
    transition = expl.transition_features
    print(df_transition)
    assert list(transition.class_names) == ['rainy', 'sunny']
    coef = transition.coef
    assert np.isclose(df_transition['rainy']['rainy'], coef[0, 0])
    assert np.isclose(df_transition['sunny']['rainy'], coef[0, 1])
    assert np.isclose(df_transition['rainy']['sunny'], coef[1, 0])
def __init__(self, model_file_path):
    """Resolve the model path and create a CRF bound to that file.

    :param model_file_path: model location; ``~`` is expanded and the
        result is made absolute before it is stored and passed to ``CRF``
    """
    expanded = path.expanduser(model_file_path)
    self.model_file_path = path.abspath(expanded)
    crf_options = {
        'algorithm': 'l2sgd',
        'c2': 0.1,
        'max_iterations': 1000,
        'all_possible_transitions': True,
        'model_filename': self.model_file_path,
    }
    self.model = CRF(**crf_options)
def __init__(
    self,
    hyper_params: Dict[str, float] = None,
    model_path: str = None,
):
    """Load a stored model or build a fresh CRF from hyper-parameters.

    :param hyper_params: optional overrides; recognised keys are
        ``"algorithm"`` (default ``"lbfgs"``), ``"c1"`` (0.1), ``"c2"``
        (0.1), ``"max_iterations"`` (100) and
        ``"all_possible_transitions"`` (``True``)
    :param model_path: when truthy, ``load_model`` is called with it and
        ``hyper_params`` is ignored
    """
    if model_path:
        self.load_model(model_path=model_path)
        return

    params = hyper_params or {}
    algorithm = params.get("algorithm", "lbfgs")
    c1 = params.get("c1", 0.1)
    c2 = params.get("c2", 0.1)
    max_iters = params.get("max_iterations", 100)
    # This used to read the misspelled key "all_possible trainsitions" and
    # only when "max_iterations" was present, which raised KeyError for any
    # caller that set max_iterations. The misspelled key is still honoured
    # as a fallback for callers that relied on it.
    apt = params.get(
        "all_possible_transitions",
        params.get("all_possible trainsitions", True),
    )

    self.fe = FeatureExtractor()
    self.crf = CRF(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iters,
        all_possible_transitions=apt,
    )
def train(self, inputfile, features_names_list, annotation_column):
    """
    Fit the classifier selected by ``self.modelname`` on the training data

    :param inputfile: path to inputfile containing the training data
    :param features_names_list: list of indications of all feature columns
        that should be used
    :param annotation_column: indication of column with annotations
    :type inputfile: string
    :type features_names_list: list
    :type annotation_column: string
    """
    # model name -> zero-argument factory; only the chosen one is invoked
    factories = {
        'logreg': lambda: LogisticRegression(),
        'naivebayes': lambda: BernoulliNB(),
        'svm': lambda: LinearSVC(),
        'crf': lambda: CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=100,
                           all_possible_transitions=True),
    }
    make_model = factories.get(self.modelname)
    # an unrecognised model name leaves self.model untouched
    if make_model is not None:
        self.model = make_model()

    # store features_names_list as class attribute
    self.features_names_list = features_names_list

    # get training features and labels, then fit the model
    train_features = self.get_features(inputfile)
    train_targets = self.get_labels(inputfile, annotation_column)
    self.model.fit(train_features, train_targets)
def train1(self, data, y, tag):
    """Fit a CRF on features built from ``data``, save it and report on the training data.

    The fitted model is pickled to ``finalized_model.sav`` in the current
    working directory. A classification report and the accuracy on the
    training data are printed.

    :param data: input handed to ``Features`` with ``self.num_features``
    :param y: not used in this method
    :param tag: not used in this method
    """
    # Features as conditional random field accepts
    feaobj = Features(data, self.num_features)
    # NOTE(review): ``get`` is accessed without being called - presumably a
    # property returning (features, labels); confirm against ``Features``.
    x_train, y_train = feaobj.get
    print("labelled data")

    # Using conditional random field as features
    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)
    print(crf)
    crf.fit(x_train, y_train)

    # Saving the model which is trained; the context manager closes the
    # file handle, which the bare open() call left to the garbage collector.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(crf, model_file)

    # Prediction on train
    pred = crf.predict(x_train)

    # printing classification report and Accuracy
    print('\n \n Prediction On Trained Data:\n \n',
          flat_classification_report(y_train, pred))
    print('Accuracy:', flat_accuracy_score(y_train, pred))
def main(path_train, path_test, path_pred, path_crf, take_first, dev_size):
    """Train an averaged-perceptron CRF tagger, save it and tag the test corpus.

    :param path_train: training corpus path
    :param path_test: test corpus path
    :param path_pred: where the predicted sentences are written
    :param path_crf: where the feature extractor and model are dumped
    :param take_first: forwarded to ``load_corpus`` for the training corpus
    :param dev_size: when truthy, this share of the training data is split
        off and passed to ``fit`` as development data
    """
    print("loading train corpus..")
    _, X_raw, y = load_corpus(path_train, take_first=take_first)

    print("extracting features from train corpus..")
    fe = TaggerFeatureExtractor()
    X = fe.fit_transform(tqdm(X_raw))

    print("training..")
    crf = CRF(algorithm='ap', verbose=True, max_iterations=10)
    # no development data unless a split was requested
    X_dev = y_dev = None
    if dev_size:
        X, X_dev, y, y_dev = train_test_split(X, y, test_size=dev_size)
    crf.fit(X, y, X_dev, y_dev)

    print("saving..")
    bundle = {'fe': fe, 'crf': crf}
    joblib.dump(bundle, path_crf, compress=2)

    print("loading test corpus..")
    corpus, X_test_raw, y_test = load_corpus(path_test)

    print("extracting features from test corpus..")
    X_test = fe.transform(X_test_raw)

    print("predicting..")
    y_pred = crf.predict(tqdm(X_test))

    print("saving results..")
    conll.write_sents(y_pred_to_sents_pred(corpus, y_pred), path_pred)
def __init__(self,
             algo: str = 'lbfgs',
             min_freq: int = 0,
             all_states: bool = False,
             max_iter: int = 100,
             epsilon: float = 1e-5,
             delta: float = 1e-5):
    """
    Store the settings and create the underlying ``CRF``.

    :param algo: optimization algorithm (lbfgs, l2sgd, ap, pa, arow)
    :param min_freq: threshold of ignoring feature
    :param all_states: if True, consider combinations of missing features
        and labels
    :param max_iter: max iteration size
    :param epsilon: forwarded as ``CRF(epsilon=...)``; the sklearn-crfsuite
        documentation describes it as the convergence-condition parameter,
        not a learning rate
    :param delta: stop training threshold
    """
    self._algo, self._min_freq = algo, min_freq
    self._all_states, self._max_iter = all_states, max_iter
    self._epsilon, self._delta = epsilon, delta
    crf_kwargs = dict(
        algorithm=algo,
        min_freq=min_freq,
        all_possible_states=all_states,
        max_iterations=max_iter,
        epsilon=epsilon,
        delta=delta,
    )
    self.model = CRF(**crf_kwargs)
def training_crf(training_cue, data, dataset):
    """Cross-validate and fit a CRF, then pickle it under ``models/<dataset>/``.

    :param training_cue: ``'cue'`` stores the model as ``crf_cue_model.pkl``,
        anything else as ``crf_sco_model.pkl``; it is also forwarded to
        ``sent2features`` and ``sent2labels``
    :param data: input for ``get_frase`` and ``get_negaciones``
    :param dataset: sub-directory name below ``models`` in the current
        working directory
    :return: ``(y, pred, crf)`` - the labels, the 5-fold cross-validated
        predictions and the model fitted on all data
    """
    frases = get_frase(data).get_frase
    get_negaciones(data)

    X = [sent2features(frase, training_cue) for frase in frases]
    y = [sent2labels(frase, training_cue) for frase in frases]

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=True,
              verbose=True)
    # cross-validated predictions are taken before the final fit
    pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
    crf.fit(X, y)

    model_suffix = ('/crf_cue_model.pkl' if training_cue == 'cue'
                    else '/crf_sco_model.pkl')
    model_filename = os.getcwd() + '/models/' + dataset + model_suffix
    with open(model_filename, 'wb') as file_model:
        pickle.dump(crf, file_model)
    return (y, pred, crf)
def test_crf(xseq, yseq, algorithm):
    """A model fitted on one sequence must predict that sequence's labels."""
    expected = [yseq]
    estimator = CRF(algorithm=algorithm)
    estimator.fit([xseq], expected)
    y_pred = estimator.predict([xseq])
    # Averaged Perceptron is regularized too much, so it is not checked
    exact_fit_expected = algorithm != 'ap'
    if exact_fit_expected:
        assert y_pred == [yseq]
def train(self, model_name, tagged_sentences):
    """Train a CRF on the first 75% of the sentences, evaluate and pickle it.

    The remaining 25% are used to print the flat accuracy. The fitted
    model is then pickled to ``model_name``.

    :param model_name: path of the file the model is pickled to
    :param tagged_sentences: sliceable sequence of tagged sentences
    """
    # Split the dataset for training and testing
    cutoff = int(.75 * len(tagged_sentences))
    training_sentences = tagged_sentences[:cutoff]
    test_sentences = tagged_sentences[cutoff:]
    X_train, y_train = transform_to_dataset(training_sentences)
    X_test, y_test = transform_to_dataset(test_sentences)
    print(len(X_train))
    print(len(X_test))

    print("Training Started........")
    print("it will take time according to your dataset size..")
    model = CRF()
    model.fit(X_train, y_train)
    print("Training Finished!")

    print("Evaluating with Test Data...")
    y_pred = model.predict(X_test)
    print("Accuracy is: ")
    print(metrics.flat_accuracy_score(y_test, y_pred))

    # The context manager closes the file even if pickling fails; the bare
    # open() call used before never closed it explicitly.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model Saved!")
def __init__(self, train, dev, test):
    """Create the CRF and build feature/label arrays.

    :param train: not read in this constructor
    :param dev: not read in this constructor
    :param test: not read in this constructor
    """
    crf_kwargs = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': False,
    }
    self.model = CRF(**crf_kwargs)
    # NOTE(review): ``sent`` is not a parameter or local; it is resolved
    # from an enclosing scope. Possibly ``train`` was intended - confirm.
    features = [self.sent2features(s) for s in sent]
    self.X = np.array(features)
    labels = [self.sent2labels(s) for s in sent]
    self.y = np.array(labels)
def __init__(self, is_save=False):
    """Create the CRF and remember whether and where to persist it.

    :param is_save: stored on ``self.is_save``
    """
    self.is_save = is_save
    self.save_model = "crf.model"
    crf_kwargs = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    self.crf = CRF(**crf_kwargs)
class CRFNerModel(object):
    """CRF-based NER model with pickle persistence and entity extraction."""

    def __init__(self, is_save=False):
        """Create the CRF.

        :param is_save: when true, ``fit`` also writes the model to
            ``self.save_model`` (``"crf.model"``)
        """
        self.crf = CRF(algorithm='lbfgs',
                       c1=0.1,
                       c2=0.1,
                       max_iterations=100,
                       all_possible_transitions=True)
        self.is_save = is_save
        self.save_model = "crf.model"

    def fit(self, train_x, train_y):
        """Fit the CRF and dump it to disk when ``is_save`` is set."""
        self.crf.fit(train_x, train_y)
        if self.is_save:
            self.dump_model()

    def predict(self, input_x):
        """Predict the tags of one sequence.

        ``input_x`` is turned into a list and featurised with
        ``sent2features``. The result is a list holding one tag list.
        """
        input_x = list(input_x)
        input_feature = [sent2features(input_x)]
        return self.crf.predict(input_feature)

    def dump_model(self):
        """Pickle the CRF to ``self.save_model``."""
        model_data = pickle.dumps(self.crf)
        with open(self.save_model, "wb") as f:
            f.write(model_data)

    def load_model(self):
        """Replace the CRF with the one pickled in ``self.save_model``.

        NOTE: unpickling executes code from the file, so only load model
        files from a trusted source.
        """
        with open(self.save_model, "rb") as f:
            model_data = f.read()
        self.crf = pickle.loads(model_data)

    def predict_list(self, input_list):
        """Predict tags for already-featurised sequences."""
        return self.crf.predict(input_list)

    def extract_ner(self, input_x):
        """Return the entities found in ``input_x``.

        :param input_x: sliceable sequence (e.g. a string) to tag
        :return: list of ``(start, end, label, input_x[start:end])`` tuples
            with ``end`` exclusive
        """
        entities = []
        tags = self.predict(input_x)[0]
        start = None
        label = None
        for i, tag in enumerate(tags):
            if tag == "O":
                # an outside tag closes any open entity
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                start = None
                label = None
                continue
            # split on the first hyphen only so labels that themselves
            # contain a hyphen do not break the unpacking
            prefix, tag_label = tag.split("-", 1)
            if prefix == "B":
                # a new entity begins; close the previous one first
                if start is not None:
                    entities.append((start, i, label, input_x[start:i]))
                start = i
                label = tag_label
            elif label != tag_label:
                # continuation with a different label: discard the span
                start = None
                label = None
        # An entity reaching the end of the sequence has no following tag
        # to close it; it used to be dropped silently.
        if start is not None:
            end = len(tags)
            entities.append((start, end, label, input_x[start:end]))
        return entities
def test_crf_score(xseq, yseq, algorithm):
    """``score`` on the training sequence is perfect (near-perfect for ``ap``)."""
    model = CRF(algorithm)
    model.fit([xseq], [yseq])
    score = model.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        assert score > 0.8
    else:
        assert score == 1.0
def test_crf_pickling(xseq, yseq, algorithm):
    """A fitted model survives a pickle round trip with its score and settings."""
    original = CRF(algorithm=algorithm)
    original.fit([xseq], [yseq])

    payload = pickle.dumps(original, protocol=pickle.HIGHEST_PROTOCOL)
    crf2 = pickle.loads(payload)

    score = crf2.score([xseq], [yseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        assert score > 0.8
    else:
        assert score == 1.0
    assert crf2.algorithm == algorithm
def test_crf_verbose(xseq, yseq, algorithm, use_dev):
    """Verbose training works with and without development data."""
    crf = CRF(algorithm, verbose=True)
    X_dev, y_dev = ([xseq], [yseq]) if use_dev else (None, None)
    crf.fit(X=[xseq, xseq], y=[yseq, yseq], X_dev=X_dev, y_dev=y_dev)
    predicted = crf.predict([xseq])
    if algorithm == "ap":
        # Averaged Perceptron is regularized too much
        return
    assert predicted == [yseq]
def test_crf_marginals(xseq, yseq, algorithm):
    """Marginals: one dict per position, keyed by every label, summing to 1."""
    crf = CRF(algorithm)
    crf.fit([xseq], [yseq])

    all_marginals = crf.predict_marginals([xseq])
    assert len(all_marginals) == 1
    marginals = all_marginals[0]
    assert len(marginals) == len(yseq)

    expected_keys = set(crf.tagger_.labels())
    for position in marginals:
        assert isinstance(position, dict)
        assert set(position.keys()) == expected_keys
        total = sum(position.values())
        assert abs(total - 1.0) < 1e-6
def test_crf_model_filename(xseq, yseq, tmpdir):
    """``model_filename`` controls where the model is stored and loaded from."""
    path = os.path.join(str(tmpdir), "foo.crfsuite")
    assert not os.path.exists(path)

    # model file is created at a specified location
    trained = CRF(model_filename=path)
    trained.fit([xseq], [yseq])
    assert os.path.exists(path)

    # it is possible to load the model just by passing a file name
    reloaded = CRF(model_filename=path)
    assert reloaded.score([xseq], [yseq]) == 1.0

    # crf is picklable
    restored = pickle.loads(
        pickle.dumps(trained, protocol=pickle.HIGHEST_PROTOCOL))
    assert restored.score([xseq], [yseq]) == 1.0
def main(arg):
    """Build train/test features and fit the model named in ``arg['model_name']``.

    ``training_sentences`` and ``test_sentences`` are read from an
    enclosing scope.

    :param arg: mapping; ``'model_name'`` selects ``"crf"`` or ``"SVM"``,
        and the whole mapping is passed to ``transform_to_dataset``
    """
    X_train, y_train = transform_to_dataset(training_sentences, arg)
    X_test, y_test = transform_to_dataset(test_sentences, arg)
    print(len(X_train))
    print(len(X_test))
    print(X_train[0])

    if arg['model_name'] == "crf":
        model = CRF()
        model.fit(X_train, y_train)
    elif arg['model_name'] == "SVM":
        v = DictVectorizer(sparse=False)
        X_tr = v.fit_transform(X_train)
        # Only transform the test data: re-fitting the vectorizer on it
        # (fit_transform) would build a different feature space whose
        # columns do not line up with the training matrix.
        X_ts = v.transform(X_test)

    sentence = ['I', 'am', 'Bob', '!']
def test_crf_dev_bad_arguments(xseq, yseq):
    """``fit`` with a third positional argument but no fourth raises ValueError."""
    repeats = 20
    X = [xseq] * repeats
    y = [yseq] * repeats
    estimator = CRF()
    with pytest.raises(ValueError):
        estimator.fit(X, y, X)
def test_predict_without_fit(xseq, algorithm):
    """Predicting with a model that was never fitted raises an exception."""
    unfitted = CRF(algorithm)
    batch = [xseq]
    with pytest.raises(Exception):
        unfitted.predict(batch)