def __first_predict(self, data):
    """
    Model::__first_predict()

    Purpose: Predict IOB chunks on data

    @param data.  A list of split sentences (1 sent = 1 line from file)
    @return       A list of list of IOB labels (1:1 mapping with data)
    """
    if globals_cliner.verbosity > 0:
        print('\textracting features (pass one)')

    # Partition sentences into prose vs. nonprose and extract the
    # feature set appropriate to each partition.
    prose = feat_obj.IOB_prose_features(
        [line for line in data if is_prose_sentence(line)])
    nonprose = feat_obj.IOB_nonprose_features(
        [line for line in data if not is_prose_sentence(line)])

    # Predict IOB labels separately for each partition
    nlist = self.__generic_first_predict('nonprose', nonprose,
                                         self._first_nonprose_vec,
                                         self._first_nonprose_clf)
    plist = self.__generic_first_predict('prose', prose,
                                         self._first_prose_vec,
                                         self._first_prose_clf)

    # Stitch prose and nonprose predictions back together in the original
    # sentence order, translating numeric labels into readable IOB strings.
    num2iob = lambda l: reverse_IOB_labels[int(l)]
    prose_iobs = []
    nonprose_iobs = []
    iobs = []
    for sentence in data:
        if sentence == []:
            # Empty sentences were never classified; emit an empty label list
            iobs.append([])
        elif is_prose_sentence(sentence):
            labels = [num2iob(l) for l in plist.pop(0)]
            prose_iobs.append(labels)
            iobs.append(labels)
        else:
            labels = [num2iob(l) for l in nlist.pop(0)]
            nonprose_iobs.append(labels)
            iobs.append(labels)

    # list of list of IOB labels
    return iobs
def __first_train(self, tokenized_sentences, Y, do_grid=False):
    """
    Model::__first_train()

    Purpose: Train the first pass classifiers (for IOB chunking)

    @param tokenized_sentences. <list> of tokenized sentences
    @param Y.                   <list-of-lists> of IOB labels for words
    @param do_grid.             <boolean> whether to perform a grid search
    @return                     None
    """
    if globals_cliner.verbosity > 0:
        print('first pass')
    if globals_cliner.verbosity > 0:
        print('\textracting features (pass one)')

    # Separate into prose vs. nonprose, keeping each sentence paired
    # with its IOB labels so the two stay aligned.
    prose_pairs = [pair for pair in zip(tokenized_sentences, Y)
                   if is_prose_sentence(pair[0])]
    nonprose_pairs = [pair for pair in zip(tokenized_sentences, Y)
                      if not is_prose_sentence(pair[0])]

    # FIX: the original did `zip(*filter(...))` unconditionally; when one
    # partition is empty, zip(*[]) yields nothing and the 2-name unpack
    # raised ValueError.  Fall back to empty sequences instead.
    if prose_pairs:
        nested_prose_data, nested_prose_Y = zip(*prose_pairs)
    else:
        nested_prose_data, nested_prose_Y = [], []
    if nonprose_pairs:
        nested_nonprose_data, nested_nonprose_Y = zip(*nonprose_pairs)
    else:
        nested_nonprose_data, nested_nonprose_Y = [], []

    # extract features
    prose = feat_obj.IOB_prose_features(nested_prose_data)
    nonprose = feat_obj.IOB_nonprose_features(nested_nonprose_data)

    # Flatten nested label lists (classifier expects one flat list)
    pchunks = flatten(nested_prose_Y)
    nchunks = flatten(nested_nonprose_Y)

    # Train classifiers for prose and nonprose
    pvec, pclf = self.__generic_first_train('prose', prose, pchunks, do_grid)
    nvec, nclf = self.__generic_first_train('nonprose', nonprose, nchunks, do_grid)

    # Save vectorizers
    self._first_prose_vec = pvec
    self._first_nonprose_vec = nvec

    # Save classifiers
    self._first_prose_clf = pclf
    self._first_nonprose_clf = nclf
def __init__(self, tagger, data):
    """
    Constructor.

    @param data. A list of split sentences
    """
    # Keep only prose sentences; nonprose lines are not sent to GENIA
    prose_only = []
    for sent in data:
        if utilities.is_prose_sentence(sent):
            prose_only.append(sent)

    # Tag the prose with the GENIA tagger and iterate over its output
    self.GENIA_features = iter(interface_genia.genia(tagger, prose_only))
def __init__(self, tagger, data):
    """
    Constructor.

    @param data. A list of split sentences
    """
    # Drop nonprose sentences before tagging
    prose_sents = list(filter(utilities.is_prose_sentence, data))

    # Process prose sentences with GENIA tagger
    self.GENIA_features = iter(interface_genia.genia(tagger, prose_sents))
def first_predict(self, data):
    """
    Model::first_predict()

    Purpose: Predict IOB chunks on data

    @param data.  A list of split sentences (1 sent = 1 line from file)
    @return       A tuple (iobs, prose_iobs, nonprose_iobs); iobs is a list
                  of list of IOB labels (1:1 mapping with data)
    """
    print('\textracting features (pass one)')

    # Wrapper object that extracts per-line features
    feat_obj = FeatureWrapper(data)

    # Separate prose and nonprose feature sets.
    # FIX: dropped the dead `plinenos`/`nlinenos` lists (and the unused
    # enumerate index) — they were appended to but never read.
    prose = []
    nonprose = []
    for line in data:
        isProse, feats = feat_obj.extract_IOB_features(line)
        if isProse:
            prose.append(feats)
        else:
            nonprose.append(feats)

    # Classify both prose & nonprose
    flabels = ['prose', 'nonprose']
    fsets = [prose, nonprose]
    dvects = [self.first_prose_vec, self.first_nonprose_vec]
    clfs = [self.first_prose_clf, self.first_nonprose_clf]
    preds = []

    for flabel, fset, dvect, clf in zip(flabels, fsets, dvects, clfs):
        # If nothing to predict, skip actual prediction
        if len(fset) == 0:
            preds.append([])
            continue

        print('\tvectorizing features (pass one) ' + flabel)

        # Cumulative sentence lengths, used to rebuild the nested
        # per-sentence structure after the flat vectorize/predict step
        offsets = [len(sublist) for sublist in fset]
        for i in range(1, len(offsets)):
            offsets[i] += offsets[i - 1]

        # Vectorize flattened features
        flattened = [item for sublist in fset for item in sublist]
        X = dvect.transform(flattened)

        print('\tpredicting labels (pass one) ' + flabel)

        # CRF requires one feature list per sentence; sci works on the flat X
        if self.crf_enabled:
            X = list(X)
            X = [X[i:j] for i, j in zip([0] + offsets, offsets)]
            lib = crf
        else:
            lib = sci

        # Predict IOB labels
        out = lib.predict(clf, X)

        # Re-nest the flat predictions per sentence
        pred = [out[i:j] for i, j in zip([0] + offsets, offsets)]
        preds.append(pred)

    # Recover predictions (preds always has exactly two entries)
    plist = preds[0]
    nlist = preds[1]

    # Stitch prose and nonprose predictions back into the original order,
    # translating numeric labels into readable IOB strings.
    trans = lambda l: reverse_IOB_labels[int(l)]
    prose_iobs = []
    nonprose_iobs = []
    iobs = []
    for sentence in data:
        if is_prose_sentence(sentence):
            labels = [trans(l) for l in plist.pop(0)]
            prose_iobs.append(labels)
        else:
            labels = [trans(l) for l in nlist.pop(0)]
            nonprose_iobs.append(labels)
        iobs.append(labels)

    # list of list of IOB labels
    return iobs, prose_iobs, nonprose_iobs
def first_predict(self, data):
    """
    Model::first_predict()

    Purpose: Predict IOB chunks on data

    @param data.  A list of split sentences (1 sent = 1 line from file)
    @return       A tuple (iobs, prose_iobs, nonprose_iobs); iobs is a list
                  of list of IOB labels (1:1 mapping with data)
    """
    print('\textracting features (pass one)')

    # Feature-extraction wrapper over the input sentences
    feat_obj = FeatureWrapper(data)

    # Split features (and their line numbers) by prose vs. nonprose
    prose, nonprose = [], []
    plinenos, nlinenos = [], []
    for lineno, line in enumerate(data):
        isProse, feats = feat_obj.extract_IOB_features(line)
        if isProse:
            prose.append(feats)
            plinenos.append(lineno)
        else:
            nonprose.append(feats)
            nlinenos.append(lineno)

    # Run classification once per partition (prose first, then nonprose)
    partitions = zip(['prose', 'nonprose'],
                     [prose, nonprose],
                     [self.first_prose_vec, self.first_nonprose_vec],
                     [self.first_prose_clf, self.first_nonprose_clf])
    preds = []
    for flabel, fset, dvect, clf in partitions:
        # Nothing in this partition -> nothing to predict
        if len(fset) == 0:
            preds.append([])
            continue

        print('\tvectorizing features (pass one) ' + flabel)

        # Running totals of sentence lengths (used to un-flatten later)
        offsets = []
        total = 0
        for sublist in fset:
            total += len(sublist)
            offsets.append(total)

        # Flatten the nested feature lists and vectorize
        X = dvect.transform([feat for sublist in fset for feat in sublist])

        print('\tpredicting labels (pass one) ' + flabel)

        # CRF consumes one feature list per sentence; sci takes the flat X
        if self.crf_enabled:
            X = list(X)
            X = [X[start:end] for start, end in zip([0] + offsets, offsets)]
            lib = crf
        else:
            lib = sci

        # Predict IOB labels
        out = lib.predict(clf, X)

        # Regroup the flat predictions into per-sentence lists
        preds.append([out[start:end]
                      for start, end in zip([0] + offsets, offsets)])

    # Recover predictions
    plist = preds[0]
    nlist = preds[1]

    # Interleave prose/nonprose predictions back into the input order,
    # mapping numeric labels to readable IOB strings.
    trans = lambda l: reverse_IOB_labels[int(l)]
    prose_iobs = []
    nonprose_iobs = []
    iobs = []
    for sentence in data:
        if is_prose_sentence(sentence):
            prose_iobs.append([trans(l) for l in plist.pop(0)])
            iobs.append(prose_iobs[-1])
        else:
            nonprose_iobs.append([trans(l) for l in nlist.pop(0)])
            iobs.append(nonprose_iobs[-1])

    # list of list of IOB labels
    return iobs, prose_iobs, nonprose_iobs