예제 #1
    def second_predict(self, data, inds_list):
        """
        Purpose: Predict a concept label for every chunk picked out by inds_list

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'

        @return          A list of (concept, lineno, start, end) tuples
                           - lineno is 1-based
                           - start/end are 0-based, inclusive word positions
                               within the line
                           - an empty list if inds_list selects no chunks
        """

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum(len(inds) for inds in inds_list) == 0:
            print("first pass predicted no concepts, skipping second pass")
            return []

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        print('\textracting  features (pass two)')

        # Extract features (one entry per line), then flatten across lines
        X = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, X)

        print('\tvectorizing features (pass two)')

        # Vectorize features
        X = self.second_vec.transform(X)

        print('\tpredicting    labels (pass two)')

        # Predict concept labels
        out = list(sci.predict(self.second_clf, X))

        # Predictions come back in the same order the chunks were visited, so
        # walk them with a cursor (list.pop(0) would shift the list each time).
        cursor = 0
        classifications = []
        for lineno, inds in enumerate(inds_list):

            chunks = data[lineno]

            # For each concept on this line
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[out[cursor]]
                cursor += 1

                # Start position = number of words in all preceding chunks
                # (ex. 7th word of line)
                start = sum(len(chunks[i].split()) for i in range(ind))

                # Length of chunk, in words
                length = len(chunks[ind].split())

                # Classification token
                classifications.append(
                    (concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications
예제 #2
파일: model.py 프로젝트: aussina/CliNER
    def second_train(self, data, inds_list, Y, do_grid=False):
        """
        Purpose: Train the second pass classifier (concept labels for chunks)

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param Y         A list of concept labels
                           - assertion: there are sum(len(inds_list)) labels
                               AKA each index from inds_list maps to a label
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
                           - fits self.second_vec on the extracted features
                           - stores the trained classifier in self.second_clf
        """

        print('\textracting  features (pass two)')

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        # Extract features (one entry per line), then flatten across lines
        per_line = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, per_line)

        print('\tvectorizing features (pass two)')

        # Vectorize labels (kept separate from the 'Y' parameter for clarity)
        label_ids = [concept_labels[y] for y in Y]

        # Vectorize features
        X = self.second_vec.fit_transform(X)

        print('\ttraining  classifier (pass two)')

        # Train the model
        self.second_clf = sci.train(X, label_ids, do_grid)
예제 #3
    def second_train(self, data, inds_list, Y, do_grid=False):
        """
        Purpose: Train the second pass classifier (concept labels for chunks)

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param Y         A list of concept labels
                           - assertion: there are sum(len(inds_list)) labels
                               AKA each index from inds_list maps to a label
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
                           - fits self.second_vec on the extracted features
                           - stores the trained classifier in self.second_clf
        """

        print('\textracting  features (pass two)')

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        # Extract features line by line, then concatenate into one flat list
        X = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, X)

        print('\tvectorizing features (pass two)')

        # Vectorize labels
        targets = [concept_labels[y] for y in Y]

        # Vectorize features
        X = self.second_vec.fit_transform(X)

        print('\ttraining  classifier (pass two)')

        # Train the model
        self.second_clf = sci.train(X, targets, do_grid)
예제 #4
    def first_train(self, data, Y, do_grid=False):
        """
        Purpose: Train the first pass classifiers (for IOB chunking)

        @param data      A list of split sentences    (1 sent = 1 line from file)
        @param Y         A list of list of IOB labels (1:1 mapping with data)
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
                           - fits self.first_prose_vec / self.first_nonprose_vec
                           - stores the trained classifiers in
                               self.first_prose_clf / self.first_nonprose_clf

        @raise Exception if either the prose or the nonprose partition of the
                           training data is empty
        """

        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Partition into prose v. nonprose
        prose = []
        nonprose = []
        pchunks = []
        nchunks = []
        for line, labels in zip(data, Y):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                pchunks += labels
            else:
                nonprose.append(feats)
                nchunks += labels

        # Train on both prose & nonprose
        flabels = ['prose', 'nonprose']
        fsets = [prose, nonprose]
        chunksets = [pchunks, nchunks]
        dvects = [self.first_prose_vec, self.first_nonprose_vec]

        vectorizers = []
        classifiers = []

        for flabel, fset, chunks, dvect in zip(flabels, fsets, chunksets,
                                               dvects):

            if len(fset) == 0:
                raise Exception(
                    'Training data must have %s training examples' % flabel)

            print('\tvectorizing features (pass one) ' + flabel)

            # Vectorize IOB labels
            targets = [IOB_labels[y] for y in chunks]

            # Save list structure to reconstruct after vectorization:
            # offsets[k] is the end position of sentence k in the flat list
            offsets = [len(sublist) for sublist in fset]
            for i in range(1, len(offsets)):
                offsets[i] += offsets[i - 1]

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.fit_transform(flattened)

            print('\ttraining classifiers (pass one) ' + flabel)

            # CRF needs reconstructed (per-sentence) lists
            if self.crf_enabled:
                bounds = list(zip([0] + offsets, offsets))
                X = list(X)
                X = [X[i:j] for i, j in bounds]
                targets = [targets[i:j] for i, j in bounds]
                lib = crf
            else:
                lib = sci

            # Train classifiers
            clf = lib.train(X, targets, do_grid)

            # Collect results; they are only stored on self once both
            # partitions have trained successfully
            vectorizers.append(dvect)
            classifiers.append(clf)

        # Save vectorizers
        self.first_prose_vec = vectorizers[0]
        self.first_nonprose_vec = vectorizers[1]

        # Save classifiers
        self.first_prose_clf = classifiers[0]
        self.first_nonprose_clf = classifiers[1]
예제 #5
    def first_predict(self, data):
        """
        Purpose: Predict IOB chunks on data

        @param data   A list of split sentences    (1 sent = 1 line from file)
        @return       A tuple (iobs, prose_iobs, nonprose_iobs)
                        - iobs:          list of list of IOB labels
                                           (1:1 mapping with data)
                        - prose_iobs:    the entries of iobs for prose lines
                        - nonprose_iobs: the entries of iobs for nonprose lines
        """

        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data, remembering where each line
        # came from so the predictions can be put back in order
        prose = []
        nonprose = []
        plinenos = []
        nlinenos = []
        for i, line in enumerate(data):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                plinenos.append(i)
            else:
                nonprose.append(feats)
                nlinenos.append(i)

        # Classify both prose & nonprose
        flabels = ['prose', 'nonprose']
        fsets = [prose, nonprose]
        dvects = [self.first_prose_vec, self.first_nonprose_vec]
        clfs = [self.first_prose_clf, self.first_nonprose_clf]
        preds = []

        for flabel, fset, dvect, clf in zip(flabels, fsets, dvects, clfs):

            # If nothing to predict, skip actual prediction
            if len(fset) == 0:
                preds.append([])
                continue

            print('\tvectorizing features (pass one) ' + flabel)

            # Save list structure to reconstruct after vectorization:
            # offsets[k] is the end position of sentence k in the flat list
            offsets = [len(sublist) for sublist in fset]
            for i in range(1, len(offsets)):
                offsets[i] += offsets[i - 1]
            bounds = list(zip([0] + offsets, offsets))

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)

            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires reconstructed (per-sentence) lists
            if self.crf_enabled:
                X = list(X)
                X = [X[i:j] for i, j in bounds]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output (one list per sentence)
            preds.append([out[i:j] for i, j in bounds])

        # Recover predictions
        plist = preds[0]
        nlist = preds[1]

        # Translate IOB labels into a readable format
        prose_iobs = [
            [reverse_IOB_labels[int(l)] for l in labels] for labels in plist
        ]
        nonprose_iobs = [
            [reverse_IOB_labels[int(l)] for l in labels] for labels in nlist
        ]

        # Stitch prose and nonprose data back together using the line numbers
        # recorded during partitioning, so the merge always agrees with it
        iobs = [None] * (len(plinenos) + len(nlinenos))
        for lineno, labels in zip(plinenos, prose_iobs):
            iobs[lineno] = labels
        for lineno, labels in zip(nlinenos, nonprose_iobs):
            iobs[lineno] = labels

        # list of list of IOB labels (plus the per-partition views)
        return iobs, prose_iobs, nonprose_iobs
예제 #6
파일: model.py 프로젝트: aussina/CliNER
    def first_train(self, data, Y, do_grid=False):
        """
        Purpose: Train the first pass classifiers (for IOB chunking)

        @param data      A list of split sentences    (1 sent = 1 line from file)
        @param Y         A list of list of IOB labels (1:1 mapping with data)
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
                           - fits self.first_prose_vec / self.first_nonprose_vec
                           - stores the trained classifiers in
                               self.first_prose_clf / self.first_nonprose_clf

        @raise Exception if either the prose or the nonprose partition of the
                           training data is empty
        """

        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Partition into prose v. nonprose
        prose    = []
        nonprose = []
        pchunks  = []
        nchunks  = []
        for line, labels in zip(data, Y):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                pchunks += labels
            else:
                nonprose.append(feats)
                nchunks += labels

        # One (label, features, chunks, vectorizer) row per partition
        partitions = [
            ('prose'   , prose   , pchunks, self.first_prose_vec   ),
            ('nonprose', nonprose, nchunks, self.first_nonprose_vec),
        ]

        vectorizers = []
        classifiers = []

        for flabel, fset, chunks, dvect in partitions:

            if len(fset) == 0:
                raise Exception('Training data must have %s training examples' % flabel)

            print('\tvectorizing features (pass one) ' + flabel)

            # Vectorize IOB labels
            iob_ids = [IOB_labels[y] for y in chunks]

            # Save list structure to reconstruct after vectorization:
            # running total of sentence lengths = end position of each sentence
            offsets = []
            total = 0
            for sublist in fset:
                total += len(sublist)
                offsets.append(total)

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.fit_transform(flattened)

            print('\ttraining classifiers (pass one) ' + flabel)

            # CRF needs reconstructed (per-sentence) lists
            if self.crf_enabled:
                starts = [0] + offsets
                X = list(X)
                X = [X[i:j] for i, j in zip(starts, offsets)]
                iob_ids = [iob_ids[i:j] for i, j in zip(starts, offsets)]
                lib = crf
            else:
                lib = sci

            # Train classifiers
            clf = lib.train(X, iob_ids, do_grid)

            # Collect results; self is only updated after both partitions
            # have trained successfully
            vectorizers.append(dvect)
            classifiers.append(clf)

        # Save vectorizers
        self.first_prose_vec    = vectorizers[0]
        self.first_nonprose_vec = vectorizers[1]

        # Save classifiers
        self.first_prose_clf    = classifiers[0]
        self.first_nonprose_clf = classifiers[1]
예제 #7
파일: model.py 프로젝트: aussina/CliNER
    def second_predict(self, data, inds_list):
        """
        Purpose: Predict a concept label for every chunk picked out by inds_list

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'

        @return          A list of (concept, lineno, start, end) tuples
                           - lineno is 1-based
                           - start/end are 0-based, inclusive word positions
                               within the line
                           - an empty list if inds_list selects no chunks
        """

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum(len(inds) for inds in inds_list) == 0:
            print("first pass predicted no concepts, skipping second pass")
            return []

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        print('\textracting  features (pass two)')

        # Extract features (one entry per line), then flatten across lines
        X = [feat_o.concept_features(s, inds) for s, inds in zip(data, inds_list)]
        X = reduce(concat, X)

        print('\tvectorizing features (pass two)')

        # Vectorize features
        X = self.second_vec.transform(X)

        print('\tpredicting    labels (pass two)')

        # Predict concept labels
        out = sci.predict(self.second_clf, X)

        # Line-by-line processing.
        # Predictions are in the order the chunks are visited below; index into
        # them with a running position rather than popping from the front of
        # the list, which is O(n) per pop.
        o = list(out)
        pos = 0
        classifications = []
        for lineno, inds in enumerate(inds_list):

            # Skip empty line
            if not inds:
                continue

            line = data[lineno]

            # For each concept
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[o[pos]]
                pos += 1

                # Get start position (ex. 7th word of line)
                start = 0
                for i in range(ind):
                    start += len(line[i].split())

                # Length of chunk
                length = len(line[ind].split())

                # Classification token
                classifications.append((concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications
예제 #8
파일: model.py 프로젝트: aussina/CliNER
    def first_predict(self, data):
        """
        Purpose: Predict IOB chunks on data

        @param data   A list of split sentences    (1 sent = 1 line from file)
        @return       A tuple (iobs, prose_iobs, nonprose_iobs)
                        - iobs:          list of list of IOB labels
                                           (1:1 mapping with data)
                        - prose_iobs:    the entries of iobs for prose lines
                        - nonprose_iobs: the entries of iobs for nonprose lines
        """

        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data, remembering the original line
        # number of every sentence so predictions can be re-ordered later
        prose    = []
        nonprose = []
        plinenos = []
        nlinenos = []
        for i, line in enumerate(data):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                plinenos.append(i)
            else:
                nonprose.append(feats)
                nlinenos.append(i)

        # Classify both prose & nonprose
        flabels = ['prose'             , 'nonprose'             ]
        fsets   = [prose               , nonprose               ]
        dvects  = [self.first_prose_vec, self.first_nonprose_vec]
        clfs    = [self.first_prose_clf, self.first_nonprose_clf]
        preds   = []

        for flabel, fset, dvect, clf in zip(flabels, fsets, dvects, clfs):

            # If nothing to predict, skip actual prediction
            if len(fset) == 0:
                preds.append([])
                continue

            print('\tvectorizing features (pass one) ' + flabel)

            # Save list structure to reconstruct after vectorization:
            # running total of sentence lengths = end position of each sentence
            offsets = []
            total = 0
            for sublist in fset:
                total += len(sublist)
                offsets.append(total)
            starts = [0] + offsets

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)

            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires reconstructed (per-sentence) lists
            if self.crf_enabled:
                X = list(X)
                X = [X[i:j] for i, j in zip(starts, offsets)]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output (one list per sentence)
            pred = [out[i:j] for i, j in zip(starts, offsets)]
            preds.append(pred)

        # Recover predictions
        plist = preds[0]
        nlist = preds[1]

        # Translate IOB labels into a readable format
        trans = lambda l: reverse_IOB_labels[int(l)]
        prose_iobs    = [[trans(l) for l in labels] for labels in plist]
        nonprose_iobs = [[trans(l) for l in labels] for labels in nlist]

        # Stitch prose and nonprose data back together by recorded line
        # number, so the merge always agrees with the partition made above
        iobs = [None] * (len(plinenos) + len(nlinenos))
        for lineno, labels in zip(plinenos, prose_iobs):
            iobs[lineno] = labels
        for lineno, labels in zip(nlinenos, nonprose_iobs):
            iobs[lineno] = labels

        # list of list of IOB labels (plus the per-partition views)
        return iobs, prose_iobs, nonprose_iobs