def test_bad():
    """ItemSequence construction is expected to raise ValueError for bad input.

    Two inputs are checked: a plain string, and a list whose items are lists
    of dicts.
    """
    bad_inputs = ('foo', [[{'foo': 'bar'}]])
    for bad_input in bad_inputs:
        with pytest.raises(ValueError):
            seq = pycrfsuite.ItemSequence(bad_input)
            print(seq.items())
def _add_point_to_model(self, srcid, trainer):
    """Append the training sequence(s) of one source id to ``trainer``.

    When ``self.concatenate_sentences`` is set, all sentences and labels of
    ``srcid`` are merged into a single sequence; otherwise one sequence is
    appended per metadata type.

    :param srcid: key into ``self.sentence_dict`` / ``self.label_dict``
    :param trainer: object exposing ``append(item_sequence, labels)``
    """
    if not self.concatenate_sentences:
        # One training sequence per metadata type.
        for metadata_type, sentence in self.sentence_dict[srcid].items():
            labels = self.label_dict[srcid][metadata_type]
            item_seq = pycrfsuite.ItemSequence(
                self._calc_features(sentence, None))
            trainer.append(item_seq, labels)
        return

    # Single merged training sequence for the whole source id.
    sentence = self.merge_sentences(self.sentence_dict[srcid])
    labels = self.merge_labels(self.label_dict[srcid])
    assert len(sentence) == len(labels)
    item_seq = pycrfsuite.ItemSequence(self._calc_features(sentence, None))
    trainer.append(item_seq, labels)
def update_model(self, srcids):
    """Retrain the CRF on all known source ids and store the resulting model.

    ``srcids`` are added to ``self.learning_srcids``; a fresh crfsuite model
    is then trained on every learning source id (plus the Brick samples when
    ``self.use_brick_flag`` is set), written to a temporary file, read back
    as bytes, handed to ``store_model`` and finally referenced through
    ``self.model_uuid``.

    The temporary model file is removed even when training or reading fails.

    :param srcids: list of source ids to add to the learning set
    """
    assert (len(self.source_buildings) == len(self.source_sample_num_list))
    # NOTE(review): this extends in place and does not de-duplicate, so a
    # srcid passed twice is trained on twice, while the stored metadata
    # below records the de-duplicated set.
    self.learning_srcids += srcids

    # crfsuite algorithms: 'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'
    algo = 'ap'
    trainer = pycrfsuite.Trainer(verbose=False, algorithm=algo)
    if algo == 'ap':
        trainer.set('max_iterations', 125)
    trainer.set_params({
        'feature.possible_states': True,
        'feature.possible_transitions': True
    })

    for srcid in self.learning_srcids:
        for metadata_type, sentence in self.sentence_dict[srcid].items():
            labels = self.label_dict[srcid][metadata_type]
            trainer.append(
                pycrfsuite.ItemSequence(self._calc_features(sentence, None)),
                labels)
    if self.use_brick_flag:
        for srcid in self.brick_srcids:
            sentence = self.brick_sentence_dict[srcid]
            labels = self.brick_label_dict[srcid]
            trainer.append(
                pycrfsuite.ItemSequence(self._calc_features(sentence, None)),
                labels)

    model_uuid = gen_uuid()
    crf_model_file = 'temp/{0}.{1}.model'.format(model_uuid, 'crfsuite')
    try:
        t0 = arrow.get()
        trainer.train(crf_model_file)
        t1 = arrow.get()
        print('training crf took: {0}'.format(t1 - t0))
        with open(crf_model_file, 'rb') as fp:
            model_bin = fp.read()
    finally:
        # The file is only a transport for the model bytes; never leave it
        # behind, not even when training or reading raised.
        if os.path.exists(crf_model_file):
            os.remove(crf_model_file)

    model = {
        'gen_time': arrow.get().datetime,
        'use_cluster_flag': self.use_cluster_flag,
        'use_brick_flag': self.use_brick_flag,
        'model_binary': BsonBinary(model_bin),
        'source_building_count': len(self.source_buildings),
        'learning_srcids': sorted(set(self.learning_srcids)),
        'uuid': model_uuid,
        'crftype': 'crfsuite'
    }
    store_model(model)
    self.model_uuid = model_uuid
def func_advanced(dialog):
    """Build the advanced feature set for every utterance of a dialog.

    Per utterance the feature dict may contain:

    * ``FirstUtt`` -- set on the first utterance of the dialog
    * ``Speaker_Changed`` -- set when the speaker differs from the previous
      utterance's speaker
    * for utterances with POS data: ``Token``, ``PartOfSpeech``, ``Length``,
      ``START_WITH``, ``BiGram``, ``Statement`` (``'Question'`` when the last
      token is ``'?'``, else ``'Answer'``) and ``TriGram``
    * for utterances without POS data: ``Other`` (the text stripped of
      ``<>.,``)

    :param dialog: sequence of utterances exposing ``speaker``, ``pos`` and
        ``text``; each ``pos`` entry exposes ``token`` and ``pos``
    :return: ``pycrfsuite.ItemSequence`` with one item per utterance
    """
    features = []
    for index, utt in enumerate(dialog):
        feature = {}
        if index == 0:
            feature["FirstUtt"] = 1
        elif utt.speaker != dialog[index - 1].speaker:
            feature["Speaker_Changed"] = 1

        if utt.pos:
            tokens = [word.token for word in utt.pos]
            feature['Token'] = tokens
            feature['PartOfSpeech'] = [word.pos for word in utt.pos]
            feature['Length'] = len(utt.pos)
            feature['START_WITH'] = utt.pos[0].token
            feature['BiGram'] = [
                first + "_" + second
                for first, second in zip(tokens[:-1], tokens[1:])
            ]
            if utt.pos[-1].token == '?':
                feature['Statement'] = 'Question'
            else:
                feature['Statement'] = 'Answer'
            # Three consecutive tokens per trigram. The previous version
            # zipped tokens[:-2] with tokens[2:], which dropped the middle
            # token and produced skip-bigrams instead.
            feature['TriGram'] = [
                "_".join(tri)
                for tri in zip(tokens, tokens[1:], tokens[2:])
            ]
        else:
            feature['Other'] = utt.text.strip("<>.,")
        features.append(feature)
    return pycrfsuite.ItemSequence(features)
def tag(data, model_file, class_id):
    """
    Deprecated: use the non-static `annotate` instead.

    Opens the model stored in `model_file`, tags every sentence of `data`,
    stores one predicted Label (label plus its marginal) on each token and
    finally calls `data.form_predicted_annotations(class_id)`. The tagger is
    closed even when tagging fails.

    :type data: nalaf.structures.data.Dataset
    :type model_file: str
    :param class_id: passed through to `data.form_predicted_annotations`
    """
    # The docstring has to be the first statement of the body; it used to sit
    # after this call and was therefore not attached to the function.
    warnings.warn('Use non-static `annotate` instead', DeprecationWarning)

    tagger = pycrfsuite.Tagger()
    try:
        tagger.open(model_file)

        for sentence in data.sentences():
            labels = tagger.tag(
                pycrfsuite.ItemSequence(token.features for token in sentence))

            for token_index in range(len(sentence)):
                label = labels[token_index]
                sentence[token_index].predicted_labels = [
                    Label(label, tagger.marginal(label, token_index))
                ]

        data.form_predicted_annotations(class_id)
    finally:
        tagger.close()
def test_floatlists():
    """FF-wrapped float lists are expected to expand to ``name:index`` features."""
    first = {"w2v": FF([1., 2., 3.])}
    second = {"w2v": FF([-1., 5, 4.])}
    seq = pycrfsuite.ItemSequence([first, second])

    assert len(seq) == 2

    expected = [
        {"w2v:0": 1., "w2v:1": 2., "w2v:2": 3.},
        {"w2v:0": -1., "w2v:1": 5., "w2v:2": 4.},
    ]
    assert seq.items() == expected

    # Rebuilding a sequence from the expanded items must give the same items.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
def test_dicts():
    """Nested dicts are expected to flatten to ``outer:inner`` keys, booleans to floats."""
    item = {
        'foo': True,
        'bar': {'foo': -1, 'baz': False},
    }
    seq = pycrfsuite.ItemSequence([item])

    assert len(seq) == 1

    expected = [{'foo': 1.0, 'bar:foo': -1, 'bar:baz': 0.0}]
    assert seq.items() == expected
def test_unicode():
    """Non-ASCII keys and string values are expected to survive the round trip."""
    item = {
        'foo': u'привет',
        u'ключ': 1.0,
        u'привет': u'мир',
    }
    seq = pycrfsuite.ItemSequence([item])

    expected = [{
        u'foo:привет': 1.0,
        u'ключ': 1.0,
        u'привет:мир': 1.0,
    }]
    assert seq.items() == expected
def annotate(self, corpus, class_id):
    """
    Tag every sentence of `corpus` with `self.tagger` and store, on each
    token, a single predicted Label holding the label and its marginal.
    Afterwards `corpus.form_predicted_annotations(class_id)` is called.

    :type corpus: nalaf.structures.data.Dataset
    :type class_id: str ~ to annotate with
    """
    tagger = self.tagger

    for sentence in corpus.sentences():
        item_sequence = pycrfsuite.ItemSequence(
            token.features for token in sentence)
        labels = tagger.tag(item_sequence)

        for token_index in range(len(sentence)):
            label = labels[token_index]
            marginal = tagger.marginal(label, token_index)
            sentence[token_index].predicted_labels = [Label(label, marginal)]

    corpus.form_predicted_annotations(class_id)
def test_nested():
    """Nested dicts, lists and sets are expected to flatten to colon-joined keys."""
    first = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": set(["x", "y"]),
        },
    }
    seq = pycrfsuite.ItemSequence([first, second])

    assert len(seq) == 2

    expected_first = {
        'foo:bar:baz': 1.0,
        'foo:spam': 0.5,
        'foo:egg:x': 1.0,
        'foo:egg:y': 1.0,
        'foo:ham:x': -0.5,
        'foo:ham:y': -0.1,
    }
    expected_second = {
        'foo:bar:ham': 1.0,
        'foo:spam': -0.5,
        'foo:ham:x': 1.0,
        'foo:ham:y': 1.0,
    }
    assert seq.items() == [expected_first, expected_second]

    # Rebuilding a sequence from the flattened items must give the same items.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
def train(data, model_file, params=None):
    """
    Train a CRF on every sentence of `data`, using each token's `features`
    as the item and the value of its first original label as the target.

    :type data: nalaf.structures.data.Dataset
    :type model_file: str ~ filename (from local file system) to save trained model to. If None, no model is saved.
    :param params: optional dict of trainer parameters, applied with
        `Trainer.set_params` before training
    """
    trainer = pycrfsuite.Trainer()
    if params is not None:
        trainer.set_params(params)

    for sentence in data.sentences():
        trainer.append(
            pycrfsuite.ItemSequence([token.features for token in sentence]),
            [token.original_labels[0].value for token in sentence])

    # The CRFSuite library handles the "pickling" of the file; saves the model here.
    # An empty filename is how python-crfsuite is told not to write a model,
    # so None is mapped to '' to honor the documented "no model is saved".
    trainer.train(model_file if model_file is not None else '')
def annotate(self, corpus, class_id):
    """
    Tag every sentence of `corpus` with `self.tagger` and store, on each
    token, a single predicted Label holding the label and its marginal.
    Any failure while building or assigning that Label is re-raised as a
    generic Exception carrying the original error as second argument.
    Afterwards `corpus.form_predicted_annotations(class_id)` is called.

    :type corpus: nalaf.structures.data.Dataset
    :type class_id: str ~ to annotate with
    """
    for sentence in corpus.sentences():
        item_sequence = pycrfsuite.ItemSequence(
            token.features for token in sentence)
        labels = self.tagger.tag(item_sequence)

        for token_index in range(len(sentence)):
            label = labels[token_index]

            try:
                predicted = Label(label, self.tagger.marginal(label, token_index))
                sentence[token_index].predicted_labels = [predicted]
            except Exception as e:
                raise Exception("Exception when assining the predicted labels; likely a Multi-Thread problem", e)

    corpus.form_predicted_annotations(class_id)
def asSequence(self, fl):
    """Convert this container into parallel lists of item sequences and labels.

    Every element of ``self`` must be a list of pairs; for each pair,
    ``pair[0].getIndices2(fl)`` becomes the item and ``pair[1]`` the label.

    :param fl: argument forwarded to ``getIndices2``
    :return: ``(X, Y)`` with one ``pycrfsuite.ItemSequence`` and one label
        list per element, or ``None`` as soon as an element is not a list
    """
    sequences = []
    label_rows = []
    for entry in self:
        if not isinstance(entry, list):
            return None
        # Evaluate item and label pair by pair, then split the columns.
        pairs = [(pair[0].getIndices2(fl), pair[1]) for pair in entry]
        items = [item for item, _ in pairs]
        tags = [tag for _, tag in pairs]
        sequences.append(pycrfsuite.ItemSequence(items))
        label_rows.append(tags)
    return sequences, label_rows
def feature(self, l_items):
    """Apply ``self.template`` to every position of ``l_items``.

    Each template entry is ``(name, rules, weight)`` where ``rules`` is a
    list of ``(field, offset)``. A rule with field ``"bos"`` / ``"eos"``
    only matches at the first / last position; any other field is read via
    ``self._get_item`` from the item at ``position + offset``. A template
    whose rules do not all match at a position contributes nothing there.

    When all collected values of a template are identical and that value is
    in ``self.ig_key``, the weight is multiplied by ``self.ig_val``.

    :param l_items: sequence of items to featurize
    :return: ``pycrfsuite.ItemSequence`` with one ``{"name=v1|v2...": weight}``
        dict per position
    """
    n_items = len(l_items)
    valid_positions = range(n_items)
    sequence = []
    for wid, item in enumerate(l_items):
        d_feature = {}
        for name, l_rule, weight in self.template:
            parts = []
            for field, offset in l_rule:
                if field == "bos":
                    if wid != 0:
                        break
                    parts.append("__BOS__")
                elif field == "eos":
                    if wid != n_items - 1:
                        break
                    parts.append("__EOS__")
                else:
                    p = wid + offset
                    if p not in valid_positions:
                        break
                    parts.append(self._get_item(l_items[p], field))
            else:
                # Reached only when no rule bailed out with ``break``.
                distinct = set(parts)
                if len(distinct) == 1 and len(distinct & set(self.ig_key)) == 1:
                    weight = weight * self.ig_val
                key = "=".join((name, "|".join(parts)))
                d_feature[key] = weight
        sequence.append(d_feature)
    return pycrfsuite.ItemSequence(sequence)
"Train and test files are in 1 word per line format, w2v and c2v are pickles mapping a word or a " "character to its google embedding vector or char embedding.") exit() train, test = sys.argv[1], sys.argv[2] use_embeddings = len(sys.argv) == 5 if use_embeddings: w2v_vocab, w2v_weights = w2v_matrix_vocab_generator(sys.argv[3]) c2v_vocab, c2v_weights = w2v_matrix_vocab_generator(sys.argv[4]) train = get_data(train) test = get_data(test) trainer = pycrfsuite.Trainer(verbose=True) # TRAINING X_train = ([ pycrfsuite.ItemSequence(sent2features(s, use_embeddings)) for s in train ]) y_train = [sent2labels(s) for s in train] X_test = ([ pycrfsuite.ItemSequence(sent2features(s, use_embeddings)) for s in test ]) y_test = [sent2labels(s) for s in test] for xseq, yseq in zip(X_train, y_train): trainer.append(xseq, yseq) trainer.set_params({ 'c1': 0.0, # coefficient for L1 penalty
sys.exit(1) # Check the testing file exists or not? if not os.path.exists(path): print "The testing file \'%s\' does not exists. Try again!" % path sys.exit(1) elif not os.path.isfile(path): print "The testing file \'%s\' is not a file. Try again!" % path sys.exit(1) # Loading testing file print "Loading the testing file ..." testset = np.load(path) # Convert testing set into CRF Feature Format featset = pycrfsuite.ItemSequence(testset[:, 0]) ref = [str(label) for label in testset[:, 1]] # Loading the model print "Loading the CRF model..." tagger = pycrfsuite.Tagger() tagger.open(model) # Testing progress #sys.stdout.write("Testing: ") #sys.stdout.flush() #pred = [] #idx = 0 #for i in featset.items(): # idx += 1 # if idx % 1000 == 0:
def test_basic():
    """An ItemSequence built from an empty list is expected to be empty."""
    empty = pycrfsuite.ItemSequence([])
    assert len(empty) == 0
    assert empty.items() == []
train_sents = LoadFile.load_crf_data(train_data_path) test_sents = LoadFile.load_crf_data(test_data_path) X_train = [sent2features(s) for s in train_sents] y_train = [sent2labels(s) for s in train_sents] X_test = [sent2features(s) for s in test_sents] y_test = [sent2labels(s) for s in test_sents] trainer = pycrfsuite.Trainer(verbose=False) for xseq, yseq in zip(X_train, y_train): itemseq = pycrfsuite.ItemSequence(xseq) trainer.append(itemseq, yseq) trainer.set_params({ 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 50, # stop earlier # include transitions that are possible, but not observed 'feature.possible_transitions': True }) trainer.train('conll2002-esp.crfsuite') print len(trainer.logparser.iterations), trainer.logparser.iterations[-1] tagger = pycrfsuite.Tagger()
# Build one (ItemSequence, labels) pair per verbal predicate. ``idx`` walks
# through ``ds`` and advances once for every argument that is consumed.
idx = 0
X = []
Y = []
for sen in txt:
    for p in sen.getPredicates():
        if not p.pos.startswith("V"):
            continue
        sq_dt = []
        sq_lbl = []
        for arg in sen:
            if arg not in p.arguments:
                continue
            sq_dt.append(ds[idx][0].getIndices2(fg))
            sq_lbl.append(ds[idx][1])
            idx += 1
        iq = pycrfsuite.ItemSequence(sq_dt)
        X.append(iq)
        Y.append(sq_lbl)

print("start training...")
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X, Y):
    trainer.append(xseq, yseq)

trainer.set_params({
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})
# --- CRF evaluation -------------------------------------------------------
labels = list(crf.classes_)
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted'))

# Sort on the part of the label after its first character, then on that
# first character.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(
    metrics.flat_classification_report(y_test,
                                       y_pred,
                                       labels=sorted_labels,
                                       digits=3))

# --- Token-level linear SVM on the same features --------------------------
clf = svm.SVC(kernel='linear')
temp = numpy.array(DictVectorizer(sparse=False).fit_transform(X_train[0]))

# Flatten all training sequences into one list of per-token feature dicts.
# The lists used to be seeded with the first sequence and then extended with
# every sequence, so the first sequence was counted twice.
to_vector = []
for t in X_train:
    to_vector.extend(pycrfsuite.ItemSequence(t).items())
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(to_vector)

# Flatten the labels the same way so rows of X and entries of Y1 line up.
Y1 = []
for t in y_train:
    Y1.extend(t)

clf.fit(X, Y1)
def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW',
         verbose=False):
    """
    A simple test to check that the calling mechanism from Python gives the same results as if CRFsuite were
    called directly from the command line. Requires a compiled version of the CRFsuite.

    The tagged tokens are written to ``training/test.txt`` as one line per
    token (tag first, then tab-separated ``feature:value`` pairs with colons
    in feature names escaped as ``\\:``), and the ``crfsuite`` command line
    tool is run on that file.

    :param raw_string: input string to test
    :type raw_string: str
    :param verbose: additional debugging output
    :type verbose: bool

    :return: None
    """
    print('Input string:', raw_string)
    print('Python Results:', tag(raw_string))

    tokens = tok.tokenize(raw_string)
    features = tok.tokens2features(tokens)

    if verbose:
        print('features:', features)

    tags = TAGGER.tag(features)
    print('Inferred tags:', tags)

    sequence_probability = round(TAGGER.probability(tags), 6)
    print('Probability of the sequence:', sequence_probability)
    assert sequence_probability == 0.992256, 'Sequence probability not correct'

    results = [
        0.999999, 0.999999, 0.999846, 0.993642, 0.999728, 1., 1., 0.998874,
        1., 1.
    ]
    for i, tg in enumerate(tags):
        prob = round(TAGGER.marginal(tg, i), 6)
        print('Marginal probability of', tg, 'in position', i, 'is', prob)
        assert prob == results[i], 'Marginal Probability of a Label not correct'

    if verbose:
        print(TAGGER.info().transitions)
        print(TAGGER.info().state_features)
        print(TAGGER.info().attributes)

    # store the ItemSequence temporarily; expand it once instead of once per
    # written line
    tmp = pycrfsuite.ItemSequence(features)
    sequence_items = tmp.items()

    # write to a text file; the context manager closes it even on error
    with open('training/test.txt', 'w') as fh:
        for i, tg in enumerate(tags):
            fh.write(tg + '\t')
            items = sequence_items[i]
            for item in sorted(items):
                itemtext = str(item)
                # '\\:' is the two characters backslash + colon; the former
                # '\:' spelling relied on an invalid escape sequence.
                fh.write(
                    itemtext.replace(':', '\\:') + ':' + str(items[item]) +
                    '\t')
            fh.write('\n')

    # command line call to the C code to test the output
    print('\nCRFsuite call results:')
    os.system(
        'crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')
iter_num = 15 sample_num = 300 precision_list = list() for c in range(0, iter_num): print(c) #%%time trainer = pycrfsuite.Trainer(verbose=False) #for srcid, setence in sentenceDict.items(): randomIdxList = random.sample(range(0, len(labelListDict)), sample_num) for i, (srcid, labels) in enumerate(labelListDict.items()): if i not in randomIdxList: continue sentence = sentenceDict[srcid] #trainer.append(pycrfsuite.ItemSequence(calc_features(sentence, labels)), labels) trainer.append(pycrfsuite.ItemSequence(calc_features(sentence)), labels) # In[6]: #%%time trainer.train('random.crfsuite') # In[7]: tagger = pycrfsuite.Tagger() tagger.open('random.crfsuite') # In[8]: #%%time
def sent2features(sent, features):
    """Return an ItemSequence holding ``word2features(sent, i, features)`` for each index of ``sent``."""
    per_position = []
    for position in range(len(sent)):
        per_position.append(word2features(sent, position, features))
    return pycrfsuite.ItemSequence(per_position)
def itemsequence(self):
    """Wrap ``self.features`` in a ``pycrfsuite.ItemSequence``."""
    features = self.features
    return pycrfsuite.ItemSequence(features)
def to_item_sequence(x, feat):
    """Return an ItemSequence holding ``feat(x, i)`` for each index ``i`` of ``x``."""
    per_position = []
    for position in range(len(x)):
        per_position.append(feat(x, position))
    return pycrf.ItemSequence(per_position)
def test_lists():
    """Lists of strings are expected to become ``{name: 1.0}`` items."""
    raw_items = [['foo', 'bar'], ['bar', 'baz']]
    seq = pycrfsuite.ItemSequence(raw_items)

    assert len(seq) == 2

    expected = [
        {'foo': 1.0, 'bar': 1.0},
        {'bar': 1.0, 'baz': 1.0},
    ]
    assert seq.items() == expected

    # Rebuilding a sequence from the expanded items must give the same items.
    rebuilt = pycrfsuite.ItemSequence(seq.items())
    assert rebuilt.items() == seq.items()
def sent2features(d, sent, h_input):
    """Return an ItemSequence holding ``word2features(d, sent, h_input, i)`` for each index of ``sent``."""
    per_position = []
    for position in range(len(sent)):
        per_position.append(word2features(d, sent, h_input, position))
    return pycrfsuite.ItemSequence(per_position)
sys.exit(2) # Check the training file exists or not? if not os.path.exists(path): print "The training file \'%s\' does not exists. Try again!" % path sys.exit(1) elif not os.path.isfile(path): print "The training file \'%s\' is not a file. Try again!" % path sys.exit(1) # Loading training data print "Loading the training data..." trainset = np.load(path) # Convert training data to CRF Feature Format featset = pycrfsuite.ItemSequence(trainset[:, 0]) labelset = trainset[:, 1] # Create a trainer trainer = pycrfsuite.Trainer() # Feeding training data to Trainer trainer.append(featset, labelset) # Set up some parameters of Trainer trainer.set_params({'c1': 1.0,\ 'c2': 1e-3,\ 'max_iterations': 50,\ 'feature.possible_transitions': True}) # Show parameters of Trainer