def predictparagraph(self):
    """
    Step 1: Split the WorkExpSection into lines.
    Step 2: Tokenize each line and partition the data in proportion 1:0
            (i.e. keep every instance in the first bucket).
    Step 3: Predict each line by applying the CRF model to its list of tokens.
    Step 4: Extract the different fields from the predicted result.
    """
    sentences = self.workexpsection.split("\n")
    sentences = [s for s in sentences if s.strip() != '']
    # Get the list of instances; the [1.0, 0.0] split leaves the second bucket empty.
    [instances, _] = partition(list(map(self.tokenize, sentences)), [1.0, 0.0])
    predictedresult = list(map(self.predict, instances))
    self.getInfo(instances, predictedresult)
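# Hedged sketch: `partition` is used throughout this module but defined
# elsewhere. Consistent with its call sites (a sequence plus a list of
# fractions, returning one bucket per fraction), a minimal implementation
# might look like the following; `partition_sketch` is a hypothetical name,
# not the project's actual helper.
def partition_sketch(data, proportions):
    """Split `data` into len(proportions) buckets by fractional size."""
    data = list(data)
    buckets, start = [], 0
    for p in proportions:
        end = start + int(round(p * len(data)))
        buckets.append(data[start:end])
        start = end
    return buckets

# Example: partition_sketch(range(10), [1.0, 0.0]) -> [[0, 1, ..., 9], []]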
def run_test():
    [train, _] = partition(get_data('data/tagged_references.txt'), [0.01, 0.0])
    (L, A) = build_domain(train)
    crf = StringCRF(L, A)

    print('Testing gradient of log-likelihood....')
    crf.preprocess(train)
    crf.W[:] = np.random.uniform(-1, 1, size=crf.W.shape)
    test_gradient(crf, train)

    # Check that we have enough features to overfit this small training set.
    crf.sgd(train, iterations=10)
    llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)
    print(f'log-likelihood {llh:g}')
    _, _, _, _, f = zip(*f1(train, 'train', crf))
    overall = 100 * np.mean(f)   # equally weighted average F1
    print(f'Overall F1 (train): {overall:.2f}')
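# Hedged sketch: `test_gradient` is called above but not defined in this file.
# A standard implementation is a central finite-difference check of the
# analytic gradient of the log-likelihood. The `crf.gradient(x)` call below is
# an assumed API for illustration; `crf.likelihood(x)` matches its use above.
def test_gradient_sketch(crf, data, eps=1e-4, n_checks=20):
    g = sum(crf.gradient(x) for x in data)   # assumed: per-instance analytic gradient
    for i in np.random.choice(crf.W.size, min(n_checks, crf.W.size), replace=False):
        w0 = crf.W[i]
        crf.W[i] = w0 + eps
        a = sum(crf.likelihood(x) for x in data)
        crf.W[i] = w0 - eps
        b = sum(crf.likelihood(x) for x in data)
        crf.W[i] = w0                        # restore the weight
        fd = (a - b) / (2 * eps)             # finite-difference estimate of g[i]
        assert abs(fd - g[i]) < 1e-3, (i, fd, g[i])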
def run(proportion=None, iterations=20):
    [train, _] = partition(get_data('data/tagged_references.txt'), proportion)

    def validate(model, _):
        llh = sum(map(model.likelihood, iterview(train, msg='llh'))) / len(train)
        _, _, _, _, f = zip(*f1(train, 'train', model))
        overall = 100 * np.mean(f)   # equally weighted average F1
        print()
        print(f'log-likelihood: {llh:g}')
        print(f'F1 overall: {overall:.2f}')
        print()

    # Create and train CRF.
    (L, A) = build_domain(train)
    crf = StringCRF(L, A)
    print(len(L), 'labels')
    print(len(A), 'features')

    fit = [crf.sgd, crf.perceptron, crf.very_sgd][0]
    fit(train, iterations=iterations, validate=validate)
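# Hedged sketch: `build_domain` is assumed (it is not defined in this file) to
# enumerate the label set L and the attribute-string set A observed in the
# training data, as its `(L, A) = build_domain(train)` call sites suggest.
# The instance layout below (a `.truth` label sequence and a token sequence
# whose tokens carry `.attributes`) is inferred from `main` further down; the
# `.sequence` attribute name is illustrative only.
def build_domain_sketch(data):
    L, A = set(), set()
    for x in data:
        L.update(x.truth)
        for tk in x.sequence:        # assumed container of Token objects
            A.update(tk.attributes)
    return sorted(L), sorted(A)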
def main(proportion=None, iterations=20, save='model.pkl~', load=None):

    class Token(object):
        def __init__(self, form):
            self.form = form
            self.attributes = []

        def add(self, features):
            """ Add features to this Token. """
            self.attributes.extend(features)

    def token_features(tk):
        """ Very basic feature extraction. """
        w = tk.form
        yield 'word=' + w
        yield 'simplified=' + re.sub('[0-9]', '0', re.sub(r'[^a-zA-Z0-9().,]', '', w.lower()))
        for c in re.findall('[^a-zA-Z0-9]', w):   # non-alpha-numeric
            yield 'contains(%r)' % c

    def preprocessing(s):
        """ Run an instance through feature extraction. """
        s[0].add(['first-token'])
        s[-1].add(['last-token'])
        for tk in s:
            tk.add(token_features(tk))
        if 1:   # toggle: neighboring-token context features
            # previous-token features
            for t in range(1, len(s)):
                s[t].add(f + '@-1' for f in token_features(s[t-1]))
            # next-token features
            for t in range(len(s) - 1):
                s[t].add(f + '@+1' for f in token_features(s[t+1]))
        return s

    def get_data(f):
        for x in fromSGML(f, linegrouper="<NEW.*?>", bioencoding=False):
            x, y = zip(*[(Token(w), y) for y, w in x])
            preprocessing(x)
            yield Instance(x, truth=y)

    [train, test] = partition(get_data('tagged_references.txt'), proportion)

    def validate(model, iteration=None):

        # Renamed from `f1` to avoid shadowing the F1 helper and the module-level f1.
        def phrase_f1(data, name):
            print()
            print('Phrase-based F1:', name)
            scores = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i, begin, end) uniquely identifies the span
                for (label, begins, ends) in truth:
                    scores.add_relevant(label, (i, begins, ends))
                for (label, begins, ends) in predict:
                    scores.add_retrieved(label, (i, begins, ends))
            print()
            return scores.scores(verbose=True)

        def weight_sparsity(W, t=0.0001):
            a = (np.abs(W) > t).sum()
            b = W.size
            print('%.2f (%s/%s) sparsity' % (a * 100.0 / b, a, b))

        # phrase_f1(train, name='TRAIN')
        # if test:
        #     phrase_f1(test, name='TEST')
        # print()
        # weight_sparsity(model.W)

        llh = sum(map(model.likelihood, iterview(train, msg='llh'))) / len(train)
        from arsenal.viz.util import lineplot   # lazy import: only needed for plotting
        with lineplot('llh') as d:
            d.append(llh)
        print()
        print('likelihood:', llh)
        print()
        print()

    if load:
        crf = StringCRF.load(load)
        validate(crf)
        return

    # Create and train CRF.
    (L, A) = build_domain(train)
    print(len(L), 'labels')
    print(len(A), 'features')
    crf = StringCRF(L, A)

    if 0:   # toggle: sanity-check the gradient before training
        print('testing gradient....')
        crf.preprocess(train)
        crf.W[:] = np.random.uniform(-1, 1, size=crf.W.shape)
        crf.test_gradient(train[:10])
        print('testing....done')

    fit = [crf.sgd, crf.perceptron, crf.very_sgd][2]
    fit(train, iterations=iterations, validate=validate)
    if save:
        crf.save(save)
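# Usage sketch (assumed entry point, not part of the original file): the
# `proportion` argument is the fractional [train, test] split handed to
# `partition`; the values and save path below are illustrative only.
if __name__ == '__main__':
    main(proportion=[0.8, 0.2], iterations=20, save='model.pkl~')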