Example No. 1: main_tag
import os
import sys
from os.path import isdir, join


def main_tag(featureSet, options):
    # Restore the label and feature name/number mappings saved at training time
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)
    if options.inFeatFile:
        # Input is already featurized: tag the feature file directly
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        # One '<name>.tagged' output file per input file, under '<input_dir>_out'
        out_dir = "{}_out".format(options.input_dir)
        os.makedirs(out_dir, exist_ok=True)  # tolerate an existing output directory
        tagger_func = lambda: tagger.tag_dir(options.input_dir)
        # NB: the output file is opened in append mode for every sentence and
        # the handle is left for the garbage collector to close
        writer_func = lambda s, c: writeSentence(
            s, out=open(join(out_dir, '{}.tagged'.format(c)), 'a'))
    else:
        # Default: read a tokenized corpus from standard input
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
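These snippets rely on a BookKeeper helper that keeps a bidirectional name/number mapping for features and labels; its definition is not part of this listing. Below is a minimal sketch of the interface called in Examples No. 1 and No. 2 (readFromFile, saveToFile, getNo, noToFeat and cutoff); the storage layout and the tab-separated file format are assumptions for illustration, not the project's actual implementation:

class BookKeeper:
    """Minimal sketch of the name <-> number bookkeeping the examples assume."""

    def __init__(self):
        self.featToNo = {}  # name -> number
        self.noToFeat = {}  # number -> name
        self._counts = {}   # name -> occurrence count, needed for cutoff()

    def getNo(self, name):
        # Assign the next free number the first time a name is seen
        if name not in self.featToNo:
            no = len(self.featToNo)
            self.featToNo[name] = no
            self.noToFeat[no] = name
        self._counts[name] = self._counts.get(name, 0) + 1
        return self.featToNo[name]

    def cutoff(self, threshold):
        # Forget every name seen fewer than `threshold` times
        for name, count in list(self._counts.items()):
            if count < threshold:
                del self.noToFeat[self.featToNo.pop(name)]
                del self._counts[name]

    def saveToFile(self, fileName):
        with open(fileName, 'w', encoding='UTF-8') as f:
            for name, no in sorted(self.featToNo.items()):
                f.write('{0}\t{1}\n'.format(name, no))

    def readFromFile(self, fileName):
        with open(fileName, encoding='UTF-8') as f:
            for line in f:
                name, no = line.rstrip('\n').split('\t')
                self.featToNo[name] = int(no)
                self.noToFeat[int(no)] = name

The scikit-learn variants below (Examples No. 3 and No. 4) use a newer interface of the same class (getNoTag, getNoTrain, noToName, numOfNames, makenoToName, save) that is not sketched here.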
Example No. 2: Trainer (liblinear back end)
class Trainer:
    def __init__(self, features, options):
        self.modelName = options['modelName']
        self.parameters = options['trainParams']
        self.cutoff = options['cutoff']
        self.features = features
        self.labels = []
        self.contexts = []
        self.labelCounter = BookKeeper()
        self.featCounter = BookKeeper()
        self.usedFeats = None
        if options['usedFeats']:
            # options['usedFeats'] is expected to be an iterable of lines, e.g. an open file
            self.usedFeats = {line.strip() for line in options['usedFeats']}

    def save(self):
        sys.stderr.write('saving model...')
        save_model(self.modelName+'.model', self.model)
        sys.stderr.write('done\nsaving label and feature lists...')
        self.labelCounter.saveToFile(self.modelName+'.labelNumbers')
        self.featCounter.saveToFile(self.modelName+'.featureNumbers')
        sys.stderr.write('done\n')

    def writeFeats(self, fileName):
        """Obsolete: dump the featurized events to a file."""
        with open(fileName, 'w') as featFile:
            for i, context in enumerate(self.contexts):
                label = self.labelCounter.noToFeat[self.labels[i]]
                feats = [self.featCounter.noToFeat[c]
                         for c in [feat[0] for feat in context]]
                featFile.write('{0}\t{1}\n'.format(label, ' '.join(feats)))

    def reduceContexts(self):
        """Drop the features that the cutoff removed from featCounter
           (contexts are assumed to be {featureNumber: value} dicts here)."""
        sys.stderr.write('reducing training events...')
        self.contexts = [{number: value
                          for number, value in context.items()
                          if number in self.featCounter.noToFeat}
                         for context in self.contexts]
        sys.stderr.write('done!\n')

    def cutoffFeats(self):
        if self.cutoff < 2:
            return
        sys.stderr.write('discarding features with less than '
                         '{0} occurrences...'.format(self.cutoff))
        self.featCounter.cutoff(self.cutoff)
        sys.stderr.write('done!\n')
        self.reduceContexts()

    def getEvents(self, data, out_file_name):
        sys.stderr.write('featurizing sentences...')
        senCount = 0
        out_file = None
        if out_file_name:
            out_file = open(out_file_name, 'w')
        for sen, _ in sentenceIterator(data):
            senCount += 1
            sentenceFeats = featurizeSentence(sen, self.features)
            for c, tok in enumerate(sen):
                tokFeats = sentenceFeats[c]
                if self.usedFeats:
                    tokFeats = [feat for feat in tokFeats
                                if feat in self.usedFeats]
                if out_file:
                    # The gold label is assumed to be the token's last field
                    out_file.write(tok[-1] + '\t' + ' '.join(tokFeats) + '\n')
                self.addContext(tokFeats, tok[-1])
            if out_file:
                out_file.write('\n')
            if senCount % 1000 == 0:
                sys.stderr.write(str(senCount) + '...')

        if out_file:
            out_file.close()
        sys.stderr.write(str(senCount) + '...done!\n')

    def getEventsFromFile(self, fileName):
        # Read already featurized events: 'label feat1 feat2 ...' per line
        with open(fileName) as featFile:
            for line in featFile:
                if line == '\n':
                    continue
                fields = line.strip().split()
                label, feats = fields[0], fields[1:]
                self.addContext(feats, label)

    def addContext(self, tokFeats, label):
        # Features are sorted to ensure identical output
        # no matter where the features are coming from
        tokFeats.sort()
        featNumbers = {self.featCounter.getNo(feat) for feat in tokFeats}

        # One (featureNumber, value=1) pair per active feature,
        # in the ctypes layout the liblinear bindings expect
        context = ((c_int * 2) * len(featNumbers))()
        for i, no in enumerate(featNumbers):
            context[i][0] = no
            context[i][1] = 1
        labelNumber = self.labelCounter.getNo(label)
        self.contexts.append(context)
        self.labels.append(labelNumber)

    def train(self):
        sys.stderr.write('creating training problem...')
        prob = problem(self.labels, self.contexts)
        sys.stderr.write('done\ntraining with option(s) "' +
                         self.parameters + '"...')
        self.model = train(prob, parameter(self.parameters))
        sys.stderr.write('done\n')
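A plausible end-to-end use of this Trainer, shown only as a sketch: the option values, the training file name and the feature configuration object are invented for illustration, and problem, parameter, train and save_model are assumed to come from the liblinear Python bindings the class already calls:

# Hypothetical driver for Example No. 2; every concrete value is made up.
options = {
    'modelName': 'hu_ner',       # yields hu_ner.model, hu_ner.labelNumbers, ...
    'trainParams': '-s 0 -c 1',  # handed verbatim to parameter() below
    'cutoff': 2,                 # drop features seen only once
    'usedFeats': None,           # or an open file with one feature name per line
}
trainer = Trainer(features, options)  # `features` comes from the feature config
with open('train.txt', encoding='UTF-8') as data:
    trainer.getEvents(data, out_file_name=None)
trainer.cutoffFeats()  # prune rare features, then reduce the stored contexts
trainer.train()
trainer.save()         # writes the model plus the label/feature number files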
Example No. 3: Tagger (scikit-learn back end)
class Tagger:
    def __init__(self, features, transModel, options):
        self._features = features
        self._dataSizes = options['dataSizes']
        self._transProbs = transModel
        print('loading observation model...', end='', file=sys.stderr, flush=True)
        self._model = joblib.load(options['modelFileName'])
        self._featCounter = BookKeeper(options['featCounterFileName'])
        self._labelCounter = BookKeeper(options['labelCounterFileName'])
        print('done', file=sys.stderr, flush=True)

    def printWeights(self, n=100, outputStream=sys.stdout):
        coefs = self._model.coef_
        labelNoToName = self._labelCounter.noToName
        featNoToName = self._featCounter.noToName
        sortedFeats = sorted(featNoToName.items())
        for i, label in sorted(labelNoToName.items()):
            columns = ['{0}:{1}'.format(w, feat)
                       for w, (no, feat) in sorted(zip(coefs[i, :], sortedFeats), reverse=True)]
            print('{0}\t{1}'.format(label, '\t'.join(columns[:n])), file=outputStream)  # Best
            # Worst -> Negative correlation
            print('{0}\t{1}'.format(label, '\t'.join(sorted(columns[-n:], reverse=True))), file=outputStream)

    def tagFeatures(self, data):
        senFeats = []
        senCount = 0
        for line in data:
            line = line.strip()
            if len(line) == 0:
                # Blank line: sentence boundary, tag the collected features
                senCount += 1
                tagging = self._tagSenFeats(senFeats)
                yield [[tag] for tag in tagging]
                senFeats = []
                if senCount % 1000 == 0:
                    print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
            else:  # Do not append the separator line itself to the sentence
                senFeats.append(line.split())
        print('{0}...done'.format(senCount), file=sys.stderr, flush=True)

    def tagDir(self, dirName):
        for fn in os.listdir(dirName):
            print('processing file {0}...'.format(fn), end='', file=sys.stderr, flush=True)
            with open(os.path.join(dirName, fn), encoding='UTF-8') as inputStream:
                for sen, _ in self.tagCorp(inputStream):
                    yield sen, fn

    def tagCorp(self, inputStream=sys.stdin):
        senCount = 0
        for sen, comment in sentenceIterator(inputStream):
            senCount += 1
            senFeats = featurizeSentence(sen, self._features)
            bestTagging = self._tagSenFeats(senFeats)
            taggedSen = [tok + [bestTagging[c]] for c, tok in enumerate(sen)]  # Add tagging to sentence
            yield taggedSen, comment
            if senCount % 1000 == 0:
                print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
        print('{0}...done'.format(senCount), file=sys.stderr, flush=True)

    def _getTagProbsByPos(self, senFeats):
        # Get Sentence Features translated to numbers and contexts in two steps
        getNoTag = self._featCounter.getNoTag
        featNumbers = [{getNoTag(feat) for feat in feats if getNoTag(feat) is not None} for feats in senFeats]

        rows = []
        cols = []
        data = []
        for rownum, featNumberSet in enumerate(featNumbers):
            for featNum in featNumberSet:
                rows.append(rownum)
                cols.append(featNum)
                data.append(1)
        contexts = csr_matrix((data, (rows, cols)), shape=(len(featNumbers), self._featCounter.numOfNames()),
                              dtype=self._dataSizes['dataNP'])
        tagProbsByPos = [{self._labelCounter.noToName[i]: prob for i, prob in enumerate(probDist)}
                         for probDist in self._model.predict_proba(contexts)]
        return tagProbsByPos

    def toCRFsuite(self, inputStream, outputStream=sys.stdout):
        senCount = 0
        getNoTag = self._featCounter.getNoTag
        featnoToName = self._featCounter.noToName
        for sen, comment in sentenceIterator(inputStream):
            senCount += 1
            senFeats = featurizeSentence(sen, self._features)
            # Get Sentence Features translated to numbers and contexts in two steps
            for featNumberSet in ({getNoTag(feat) for feat in feats if getNoTag(feat) is not None}
                                  for feats in senFeats):
                print('\t'.join(featnoToName[featNum].replace(':', 'colon') for featNum in featNumberSet),
                      file=outputStream)
            print(file=outputStream)  # Sentence separator blank line
            if senCount % 1000 == 0:
                print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
        print('{0}...done'.format(senCount), file=sys.stderr, flush=True)

    def _tagSenFeats(self, senFeats):
        return self._transProbs.tagSent(self._getTagProbsByPos(senFeats))
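The heart of _getTagProbsByPos is turning the per-token feature-number sets into a sparse binary matrix with one row per token. A self-contained toy version (the feature numbers and the column count are invented) shows the construction:

from scipy.sparse import csr_matrix

# One set of feature numbers per token of a three-token sentence (toy values)
featNumbers = [{0, 3}, {1}, {2, 4, 5}]

rows, cols, data = [], [], []
for rownum, featNumberSet in enumerate(featNumbers):
    for featNum in featNumberSet:
        rows.append(rownum)   # row index: token position in the sentence
        cols.append(featNum)  # column index: feature number
        data.append(1)        # binary indicator feature

contexts = csr_matrix((data, (rows, cols)), shape=(len(featNumbers), 6))
print(contexts.toarray())
# [[1 0 0 1 0 0]
#  [0 1 0 0 0 0]
#  [0 0 1 0 1 1]]
# Each row is then fed to the model's predict_proba to obtain a per-token
# probability distribution over the labels.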
Example No. 4: Trainer (scikit-learn back end)
class Trainer:
    def __init__(self, features, options):

        # Set classifier algorithm here
        parameters = dict()  # dict(solver='lbfgs')
        solver = LogisticRegression

        # Possible alternative solvers:
        # parameters = {'loss':'modified_huber',  'n_jobs': -1}
        # solver = SGDClassifier

        # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
        # parameters = {'kernel': 'rbf', 'probability': True}
        # solver = SVC

        # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
        # parameters = {'kernel': 'linear', 'probability': True}
        # solver = OneVsRestClassifier(SVC(**parameters))  # XXX won't work as-is:
        # solver would already be an instance, so solver(**parameters) below would fail

        self._model = solver(**parameters)
        self._dataSizes = options['dataSizes']
        self._tagField = options['tagField']
        self._modelFileName = options['modelFileName']
        self._parameters = options['trainParams']
        self._cutoff = options['cutoff']
        self._featCounterFileName = options['featCounterFileName']
        self._labelCounterFileName = options['labelCounterFileName']
        self._features = features

        self._tokCount = -1  # Index starts from 0

        self._rows = array(self._dataSizes['rows'])
        self._cols = array(self._dataSizes['cols'])
        self._data = array(self._dataSizes['data'])
        self._labels = array(self._dataSizes['labels'])
        self._sentEnd = array(
            self._dataSizes['sentEnd'])  # Keep track of sentence boundaries
        self._matrix = None

        self._featCounter = BookKeeper()
        self._labelCounter = BookKeeper()
        self._usedFeats = None
        if 'usedFeats' in options and options['usedFeats']:
            self._usedFeats = {
                line.strip()
                for line in open(options['usedFeats'], encoding='UTF-8')
            }

    def save(self):
        print('saving model...', end='', file=sys.stderr, flush=True)
        joblib.dump(self._model, '{0}'.format(self._modelFileName), compress=3)
        print('done\nsaving feature and label lists...',
              end='',
              file=sys.stderr,
              flush=True)
        self._featCounter.save(self._featCounterFileName)
        self._labelCounter.save(self._labelCounterFileName)
        print('done', file=sys.stderr, flush=True)

    def _updateSentEnd(self, sentEnds, rowNums):
        # Map the old sentence-end markers onto the surviving row numbers
        newEnds = array(self._dataSizes['sentEnd'])
        vbeg = 0
        for end in sentEnds:
            vend = -1
            for i, e in enumerate(rowNums[vbeg:]):
                if e <= end:
                    vend = vbeg + i
                else:
                    break
            if vend >= 0:  # >= 0: a sentence may end at the very first surviving row
                newEnds.append(vend)
                vbeg = vend + 1
        return newEnds

    def _convertToNPArray(self):
        # The *NP entries of dataSizes are expected to hold the numpy dtypes
        # matching the array.array typecodes used while collecting the data
        rowsNP = np.array(self._rows, dtype=self._dataSizes['rowsNP'])
        colsNP = np.array(self._cols, dtype=self._dataSizes['colsNP'])
        dataNP = np.array(self._data, dtype=self._dataSizes['dataNP'])
        labelsNP = np.array(self._labels, dtype=self._dataSizes['labelsNP'])
        del self._rows
        del self._cols
        del self._data
        del self._labels
        self._rows = rowsNP
        self._cols = colsNP
        self._data = dataNP
        self._labels = labelsNP

    def _makeSparseArray(self, rowNum, colNum):
        print('creating training problem...',
              end='',
              file=sys.stderr,
              flush=True)
        matrix = csr_matrix((self._data, (self._rows, self._cols)),
                            shape=(rowNum, colNum),
                            dtype=self._dataSizes['dataNP'])
        del self._rows
        del self._cols
        del self._data
        print('done!', file=sys.stderr, flush=True)
        return matrix

    def cutoffFeats(self):
        self._convertToNPArray()
        colNum = self._featCounter.numOfNames()
        if self._cutoff < 2:
            self._matrix = self._makeSparseArray(self._tokCount, colNum)
        else:
            print(
                'discarding features with less than {0} occurrences...'.format(
                    self._cutoff),
                end='',
                file=sys.stderr,
                flush=True)

            toDelete = self._featCounter.cutoff(self._cutoff)
            print('done!\nreducing training events by {0}...'.format(
                len(toDelete)),
                  end='',
                  file=sys.stderr,
                  flush=True)
            # ...that are not in featCounter anymore
            indicesToKeepNP = np.fromiter(
                (ind for ind, featNo in enumerate(self._cols)
                 if featNo not in toDelete),
                dtype=self._dataSizes['colsNP'])
            del toDelete

            # Reduce cols
            colsNPNew = self._cols[indicesToKeepNP]
            del self._cols
            self._cols = colsNPNew

            # Reduce data
            dataNPNew = self._data[indicesToKeepNP]
            del self._data
            self._data = dataNPNew

            # Reduce rows
            rowsNPNew = self._rows[indicesToKeepNP]
            rowNumKeep = np.unique(rowsNPNew)
            rowNum = rowNumKeep.shape[0]
            # One past the largest surviving feature number
            colNum = int(self._cols.max()) + 1
            del self._rows
            self._rows = rowsNPNew
            del indicesToKeepNP

            # Reduce labels
            labelsNPNew = self._labels[rowNumKeep]
            del self._labels
            self._labels = labelsNPNew

            # Update sentence end markers
            newEnd = self._updateSentEnd(self._sentEnd, rowNumKeep)
            del self._sentEnd
            self._sentEnd = newEnd
            del rowNumKeep

            print('done!', file=sys.stderr, flush=True)
            matrix = self._makeSparseArray(rowNum, colNum)
            print('updating indices...', end='', file=sys.stderr, flush=True)

            # Update rowNos
            rows, _ = matrix.nonzero()
            matrixNew = matrix[np.unique(rows), :]
            del matrix
            del rows

            # Update featNos
            _, cols = matrixNew.nonzero()
            self._matrix = matrixNew[:, np.unique(cols)]
            del matrixNew
            del cols

            print('done!', file=sys.stderr, flush=True)

    # The input needs featurizing
    def getEvents(self, data):
        print('featurizing sentences...', end='', file=sys.stderr, flush=True)
        senCount = 0
        tokIndex = -1  # Index starts from 0
        for sen, _ in sentenceIterator(data):
            senCount += 1
            sentenceFeats = featurizeSentence(sen, self._features)
            for c, tok in enumerate(sen):
                tokIndex += 1
                tokFeats = sentenceFeats[c]
                if self._usedFeats:
                    tokFeats = [
                        feat for feat in tokFeats if feat in self._usedFeats
                    ]
                self._addContext(tokFeats, tok[self._tagField], tokIndex)
            self._sentEnd.append(tokIndex)
            if senCount % 1000 == 0:
                print('{0}...'.format(senCount),
                      end='',
                      file=sys.stderr,
                      flush=True)

        self._tokCount = tokIndex + 1
        print('{0}...done!'.format(senCount), file=sys.stderr, flush=True)

    # Already featurized input
    def getEventsFromFile(self, data):
        tokIndex = -1  # Index starts from 0
        for line in data:
            line = line.strip()
            if len(line) > 0:
                tokIndex += 1
                fields = line.split()
                label, feats = fields[0], fields[1:]
                self._addContext(feats, label, tokIndex)
            else:
                self._sentEnd.append(tokIndex)  # Blank line marks a sentence boundary
        self._tokCount = tokIndex + 1

    def _addContext(self, tokFeats, label, curTok):
        rowsAppend = self._rows.append
        colsAppend = self._cols.append
        dataAppend = self._data.append

        # Features are sorted to ensure identical output no matter where the features are coming from
        for featNumber in {
                self._featCounter.getNoTrain(feat)
                for feat in sorted(tokFeats)
        }:
            rowsAppend(curTok)
            colsAppend(featNumber)
            dataAppend(1)

        self._labels.append(self._labelCounter.getNoTrain(label))

    # Counting zero elements can be really slow...
    def mostInformativeFeatures(self,
                                outputStream=sys.stdout,
                                n=-1,
                                countZero=False):
        # Compute min(P(feature=value|label1), for any label1)/max(P(feature=value|label2), for any label2)
        # (conditional probs are computed from joint probabilities) as in NLTK (Bird et al. 2009):
        # P(feature=value|label) = P(feature=value, label)/P(label)
        # P(feature=value, label) = C(feature=value, label)/C(feature=value)
        # P(label) = C(label)/sum_i(C(label_i))
        #
        # P(feature=value|label) = (C(feature=value, label)/C(feature=value))/(C(label)/sum_i(C(label_i))) =
        # (C(feature=value, label)*sum_i(C(label_i)))/(C(feature=value)*C(label))
        #
        # min(P(feature=value|label1), for any label1)/max(P(feature=value|label2), for any label2) =
        #
        # min((C(feature=value, label1)*sum_i(C(label_i)))/(C(feature=value)*C(label1)), for any label1)/
        # max((C(feature=value, label2)*sum_i(C(label_i)))/(C(feature=value)*C(label2)), for any label2) =
        #
        # (sum_i(C(label_i))/C(feature=value))*min(C(feature=value, label1)/C(label1)), for any label1)/
        # (sum_i(C(label_i))/C(feature=value))*max(C(feature=value, label2)/C(label2)), for any label2) =
        #
        # min(C(feature=value, label1)/C(label1), for any label1)/
        # max(C(feature=value, label2)/C(label2), for any label2)
        matrix = self._matrix  # For easier handling
        self._featCounter.makenoToName()
        self._labelCounter.makenoToName()
        featnoToName = self._featCounter.noToName
        labelnoToName = self._labelCounter.noToName
        labels = self._labels  # indexed by token rows (row = token number, column = feature number)
        featValCounts = defaultdict(Counter)  # feat, val -> label: count

        if countZero:
            # Every index (including zeros to consider negative correlation)
            for feat in range(matrix.shape[1]):
                for tok in range(matrix.shape[0]):
                    featValCounts[feat, matrix[tok, feat]][labels[tok]] += 1
        else:
            matrix = matrix.tocoo()
            # Every nonzero index
            for tok, feat, val in zip(matrix.row, matrix.col, matrix.data):
                featValCounts[feat, val][labels[tok]] += 1
        del matrix

        # (C(label2), for any label2)
        labelCounts = Counter()
        for k, v in zip(*np.unique(self._labels, return_counts=True)):
            labelCounts[k] = v

        numOfLabels = len(labelCounts)
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)
        features = set()
        # For every (feature, val) tuple (that has a nonzero count)
        for feature, counts in featValCounts.items():
            features.add(feature)
            # For every label...
            for label, count in counts.items():
                # prob can only be 0 if the numerator is 0, but that case is already filtered by the Counter...
                prob = count / labelCounts[label]
                maxprob[feature] = max(prob, maxprob[feature])
                minprob[feature] = min(prob, minprob[feature])

        # Convert features to a list and sort it by how informative they are.
        # From the NLTK docs:
        #     For the purpose of this function, the informativeness of a
        #     feature ``(fname,fval)`` is equal to the highest value of
        #     P(fname=fval|label), for any label, divided by the lowest
        #     value of P(fname=fval|label), for any label:
        #
        #     |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        print('"Feature name"=Value (True/False)',
              'Sum of occurences',
              'Counts per label',
              'Probability per label',
              'Max prob.:Min prob.=Ratio:1.0',
              sep='\t',
              file=outputStream)  # Print header (legend)
        # Sort by the inverse ratio (minprob/maxprob) to avoid division by zero
        for feature in sorted(features,
                              key=lambda feature_: minprob[feature_] / maxprob[feature_])[:n]:
            sumOccurrences = sum(featValCounts[feature].values())
            if len(featValCounts[feature]) < numOfLabels:
                ratio = 'INF'
            else:
                ratio = maxprob[feature] / minprob[feature]
            # NLTK notation:
            # print('{0:50} = {1:} {2:6} : {3:-6} = {4} : 1.0'.format(featnoToName(feature[0]), feature[1],
            #                                                                maxprob[feature],
            #                                                                minprob[feature], ratio))
            # More detailed notation:
            print('"{0:50s}"={1}\t{2}\t{3}\t{4}\t{5:6}:{6:-6}={7}:1.0'.format(
                featnoToName[feature[0]], bool(feature[1]), sumOccurrences,
                '/'.join('{0}:{1}'.format(labelnoToName[l], c)
                         for l, c in featValCounts[feature].items()),
                '/'.join('{0}:{1:.8f}'.format(labelnoToName[l], c / labelCounts[l])
                         for l, c in featValCounts[feature].items()),
                maxprob[feature], minprob[feature], ratio),
                  file=outputStream)

    def toCRFsuite(self, outputStream=sys.stdout):
        self._featCounter.makenoToName()
        self._labelCounter.makenoToName()
        featnoToName = self._featCounter.noToName
        labelnoToName = self._labelCounter.noToName
        sentEnd = self._sentEnd
        matrix = self._matrix.tocsr()
        labels = self._labels
        beg = 0
        for end in sentEnd:
            for row in range(beg, end + 1):
                print('{0}\t{1}'.format(
                    labelnoToName[labels[row]],
                    '\t'.join(featnoToName[col].replace(':', 'colon')
                              for col in matrix[row, :].nonzero()[1])),
                      file=outputStream)
            print(file=outputStream)  # Sentence separator blank line
            beg = end + 1

    def train(self):
        print('training with option(s) "{0}"...'.format(self._parameters),
              end='',
              file=sys.stderr,
              flush=True)
        self._model.fit(self._matrix, self._labels)
        print('done', file=sys.stderr, flush=True)
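Driving the scikit-learn Trainer follows the same sequence as the liblinear one. A sketch with invented option values follows; the dataSizes entries pair an array.array typecode with a matching numpy dtype, as inferred from the constructor and _convertToNPArray, and `features` and the training file name are placeholders:

import numpy as np

# Hypothetical options for Example No. 4; all concrete values are illustrative.
options = {
    'dataSizes': {'rows': 'Q', 'rowsNP': np.uint64,
                  'cols': 'I', 'colsNP': np.uint32,
                  'data': 'B', 'dataNP': np.uint8,
                  'labels': 'H', 'labelsNP': np.uint16,
                  'sentEnd': 'Q'},
    'tagField': -1,      # the gold label is the token's last field
    'modelFileName': 'hu_ner.model',
    'trainParams': '',   # only echoed in the log message by this class
    'cutoff': 2,
    'featCounterFileName': 'hu_ner.featureNumbers',
    'labelCounterFileName': 'hu_ner.labelNumbers',
}
trainer = Trainer(features, options)  # `features` comes from the feature config
with open('train.txt', encoding='UTF-8') as data:
    trainer.getEvents(data)  # fills the rows/cols/data/labels arrays
trainer.cutoffFeats()        # builds the sparse matrix, pruning rare features
trainer.train()              # LogisticRegression.fit on the CSR matrix
trainer.save()               # joblib model dump plus the BookKeeper files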