示例#1
0
文件: model.py 项目: yofayed/CliNER
    def __first_predict(self, data):
        """
        Model::__first_predict()

        Purpose: Predict IOB chunks on data

        Every sentence is classified once as empty, prose or nonprose. The
        prose and nonprose groups are featurized and labelled separately,
        then the per-sentence predictions are merged back in the order of
        data. An empty sentence always yields an empty label list.

        @param data.  A list of split sentences    (1 sent = 1 line from file)
        @return       A list of list of IOB labels (1:1 mapping with data)
        """

        if globals_cliner.verbosity > 0:
            print('\textracting  features (pass one)')

        # Classify each sentence exactly once and remember the outcome, so the
        # partitioning and the stitching below can never disagree.
        #   None -> empty sentence, True -> prose, False -> nonprose
        sentence_kinds = []
        nested_prose_data = []
        nested_nonprose_data = []
        for sentence in data:
            if sentence == []:
                # Empty sentences are kept out of both partitions: stitching
                # emits [] for them without consuming a prediction, so sending
                # them to a classifier would shift every later label by one
                # whenever the predictor returns one entry per input sentence.
                sentence_kinds.append(None)
            elif is_prose_sentence(sentence):
                sentence_kinds.append(True)
                nested_prose_data.append(sentence)
            else:
                sentence_kinds.append(False)
                nested_nonprose_data.append(sentence)

        # Extract features for each partition
        prose = feat_obj.IOB_prose_features(nested_prose_data)
        nonprose = feat_obj.IOB_nonprose_features(nested_nonprose_data)

        # Predict labels for IOB nonprose and prose text
        nlist = self.__generic_first_predict('nonprose', nonprose,
                                             self._first_nonprose_vec,
                                             self._first_nonprose_clf)
        plist = self.__generic_first_predict('prose', prose,
                                             self._first_prose_vec,
                                             self._first_prose_clf)

        # Stitch prose and nonprose predictions back together in input order.
        # Iterators give O(1) consumption (list.pop(0) is O(n) per call).
        prose_predictions = iter(plist)
        nonprose_predictions = iter(nlist)
        iobs = []
        for is_prose in sentence_kinds:
            if is_prose is None:
                iobs.append([])
                continue

            if is_prose:
                predicted = next(prose_predictions)
            else:
                predicted = next(nonprose_predictions)

            # translate numeric IOB labels into a readable format
            iobs.append([reverse_IOB_labels[int(label)]
                         for label in predicted])

        # list of list of IOB labels
        return iobs
示例#2
0
文件: model.py 项目: yofayed/CliNER
    def __first_train(self, tokenized_sentences, Y, do_grid=False):
        """
        Model::__first_train()

        Purpose: Train the first pass classifiers (for IOB chunking)

        The sentences (and their labels) are split into a prose and a
        nonprose group; one vectorizer/classifier pair is trained per group
        and stored on the instance.

        @param tokenized_sentences. <list> of tokenized sentences
        @param Y.                   <list-of-lists> of IOB labels for words
        @param do_grid.             <boolean> whether to perform a grid search

        @return          None
        """

        if globals_cliner.verbosity > 0:
            print('first pass')
            print('\textracting  features (pass one)')

        # Separate into prose v nonprose in a single pass.
        # Plain lists are used instead of unpacking zip(*filter(...)), because
        # that unpacking raises ValueError as soon as one group is empty.
        nested_prose_data = []
        nested_prose_Y = []
        nested_nonprose_data = []
        nested_nonprose_Y = []
        for sentence, labels in zip(tokenized_sentences, Y):
            if is_prose_sentence(sentence):
                nested_prose_data.append(sentence)
                nested_prose_Y.append(labels)
            else:
                nested_nonprose_data.append(sentence)
                nested_nonprose_Y.append(labels)

        # Extract features
        prose = feat_obj.IOB_prose_features(nested_prose_data)
        nonprose = feat_obj.IOB_nonprose_features(nested_nonprose_data)

        # Flatten label lists (because classifier will expect flat)
        pchunks = flatten(nested_prose_Y)
        nchunks = flatten(nested_nonprose_Y)

        # Train classifiers for prose and nonprose
        pvec, pclf = self.__generic_first_train('prose', prose, pchunks,
                                                do_grid)
        nvec, nclf = self.__generic_first_train('nonprose', nonprose, nchunks,
                                                do_grid)

        # Store results only after both trainings succeeded, so a failure
        # leaves the previously stored models untouched.

        # Save vectorizers
        self._first_prose_vec = pvec
        self._first_nonprose_vec = nvec

        # Save classifiers
        self._first_prose_clf = pclf
        self._first_nonprose_clf = nclf
示例#3
0
    def __init__(self, tagger, data):
        """
        Constructor.

        Keeps only the prose sentences of data, hands them to the GENIA
        interface and stores an iterator over what it returns.

        @param tagger. Forwarded unchanged to interface_genia.genia()
        @param data.   A list of split sentences
        """
        # Only prose sentences are sent to GENIA
        prose_sentences = [
            sentence for sentence in data
            if utilities.is_prose_sentence(sentence)
        ]

        # Tag them in one call and expose the result as an iterator
        genia_output = interface_genia.genia(tagger, prose_sentences)
        self.GENIA_features = iter(genia_output)
示例#4
0
    def __init__(self, tagger, data):
        """
        Constructor.

        Selects the prose sentences from data, passes them to the GENIA
        interface together with tagger, and keeps an iterator over the
        returned features in self.GENIA_features.

        @param tagger. Handed as-is to interface_genia.genia()
        @param data.   A list of split sentences
        """

        # Drop every sentence that is not prose
        prose_only = []
        for sent in data:
            if utilities.is_prose_sentence(sent):
                prose_only.append(sent)

        # Process the remaining sentences with the GENIA tagger
        tagged = interface_genia.genia(tagger, prose_only)
        self.GENIA_features = iter(tagged)
示例#5
0
    def first_predict(self, data):
        """
        Model::first_predict()

        Purpose: Predict IOB chunks on data

        Sentences are split into prose and nonprose according to the flag
        returned by the feature extractor, each group is vectorized and
        labelled with its own vectorizer/classifier, and the predictions are
        merged back in the order of data.

        @param data.  A list of split sentences    (1 sent = 1 line from file)
        @return       A tuple (iobs, prose_iobs, nonprose_iobs)
                        iobs:          list of list of IOB labels
                                       (1:1 mapping with data)
                        prose_iobs:    label lists of the prose sentences
                        nonprose_iobs: label lists of the nonprose sentences
        """

        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data. The prose flag of every sentence
        # is remembered so that stitching uses exactly the decision that
        # routed the sentence to a classifier.
        prose = []
        nonprose = []
        prose_flags = []
        for line in data:
            isProse, feats = feat_obj.extract_IOB_features(line)
            prose_flags.append(isProse)
            if isProse:
                prose.append(feats)
            else:
                nonprose.append(feats)

        # Classify both prose & nonprose
        jobs = [
            ('prose', prose, self.first_prose_vec, self.first_prose_clf),
            ('nonprose', nonprose,
             self.first_nonprose_vec, self.first_nonprose_clf),
        ]
        preds = []

        for flabel, fset, dvect, clf in jobs:

            # If nothing to predict, skip actual prediction
            if not fset:
                preds.append([])
                continue

            print('\tvectorizing features (pass one) ' + flabel)

            # Save list structure to reconstruct after vectorization:
            # (start, end) positions of each sentence in the flattened data
            offsets = []
            total = 0
            for sublist in fset:
                total += len(sublist)
                offsets.append(total)
            spans = list(zip([0] + offsets, offsets))

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)

            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires reconstruct lists
            if self.crf_enabled:
                X = list(X)
                X = [X[i:j] for i, j in spans]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output (one slice per sentence)
            preds.append([out[i:j] for i, j in spans])

        # Recover predictions
        plist, nlist = preds

        # Stitch prose and nonprose data back together and translate IOB
        # labels into a readable format. Iterators give O(1) consumption
        # (list.pop(0) is O(n) per call).
        prose_predictions = iter(plist)
        nonprose_predictions = iter(nlist)
        prose_iobs = []
        nonprose_iobs = []
        iobs = []
        for isProse in prose_flags:
            if isProse:
                labels = [reverse_IOB_labels[int(l)]
                          for l in next(prose_predictions)]
                prose_iobs.append(labels)
            else:
                labels = [reverse_IOB_labels[int(l)]
                          for l in next(nonprose_predictions)]
                nonprose_iobs.append(labels)
            iobs.append(labels)

        # list of list of IOB labels (overall, prose only, nonprose only)
        return iobs, prose_iobs, nonprose_iobs
示例#6
0
    def first_predict(self, data):

        """
        Model::first_predict()

        Purpose: Predict IOB chunks on data

        Sentences are split into prose and nonprose according to the flag
        returned by the feature extractor, each group is vectorized and
        labelled with its own vectorizer/classifier, and the predictions are
        merged back in the order of data.

        @param data.  A list of split sentences    (1 sent = 1 line from file)
        @return       A tuple (iobs, prose_iobs, nonprose_iobs)
                        iobs:          list of list of IOB labels
                                       (1:1 mapping with data)
                        prose_iobs:    label lists of the prose sentences
                        nonprose_iobs: label lists of the nonprose sentences
        """

        print('\textracting  features (pass one)')


        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data. The prose flag of every sentence
        # is remembered so that stitching uses exactly the decision that
        # routed the sentence to a classifier.
        prose       = []
        nonprose    = []
        prose_flags = []
        for line in data:
            isProse,feats = feat_obj.extract_IOB_features(line)
            prose_flags.append(isProse)
            if isProse:
                prose.append(feats)
            else:
                nonprose.append(feats)


        # Classify both prose & nonprose
        flabels = ['prose'             , 'nonprose'             ]
        fsets   = [prose               , nonprose               ]
        dvects  = [self.first_prose_vec, self.first_nonprose_vec]
        clfs    = [self.first_prose_clf, self.first_nonprose_clf]
        preds   = []

        for flabel,fset,dvect,clf in zip(flabels, fsets, dvects, clfs):

            # If nothing to predict, skip actual prediction
            if not fset:
                preds.append([])
                continue


            print('\tvectorizing features (pass one) ' + flabel)

            # Save list structure to reconstruct after vectorization:
            # (start, end) positions of each sentence in the flattened data
            offsets = []
            running = 0
            for sublist in fset:
                running += len(sublist)
                offsets.append(running)
            bounds = list(zip([0] + offsets, offsets))

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)


            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires reconstruct lists
            if self.crf_enabled:
                X = list(X)
                X = [ X[i:j] for i, j in bounds ]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output (one slice per sentence)
            pred = [ out[i:j] for i, j in bounds ]
            preds.append(pred)


        # Recover predictions
        plist = preds[0]
        nlist = preds[1]


        # Stitch prose and nonprose data back together and translate IOB
        # labels into a readable format. Iterators give O(1) consumption
        # (list.pop(0) is O(n) per call).
        remaining_prose    = iter(plist)
        remaining_nonprose = iter(nlist)
        prose_iobs    = []
        nonprose_iobs = []
        iobs          = []
        for isProse in prose_flags:
            if isProse:
                source, target = remaining_prose, prose_iobs
            else:
                source, target = remaining_nonprose, nonprose_iobs

            readable = [ reverse_IOB_labels[int(l)] for l in next(source) ]
            target.append( readable )
            iobs.append( readable )


        # list of list of IOB labels (overall, prose only, nonprose only)
        return iobs, prose_iobs, nonprose_iobs