Exemplo n.º 1
0
def main_tag(featureSet, options):
    """Tag input with a trained model and write out every tagged sentence.

    The input source is picked from ``options`` in this order of precedence:

    1. ``options.inFeatFile`` -- handed to ``tagger.tag_features``;
    2. ``options.input_dir`` -- handed to ``tagger.tag_dir``; every sentence
       is appended to ``<input_dir>_out/<name>.tagged``, where ``<name>`` is
       the second item of the pair yielded by the tagger;
    3. otherwise ``sys.stdin`` -- handed to ``tagger.tag_corp``.

    In cases 1 and 3 the second item of each yielded pair is passed to
    ``writeSentence`` as ``comment``.

    :param featureSet: passed through unchanged to ``Tagger``.
    :param options: namespace-like object with the attributes ``modelName``,
        ``inFeatFile`` and ``input_dir``. NOTE: ``vars(options)`` returns the
        object's own attribute dict, so the entries added below also become
        attributes of ``options``.
    :raises AssertionError: if ``options.input_dir`` is set but is not a
        directory (not checked when Python runs with ``-O``).
    :raises OSError: propagated from ``os.mkdir``, e.g. ``FileExistsError``
        when the output directory already exists.
    """
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))

    # Same dict object as options.__dict__: the Tagger receives the
    # command-line options plus the three entries added here.
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)

    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)

        def writer_func(s, c):
            # Open per sentence in append mode, but close the handle right
            # away: leaving it open leaks one file descriptor per sentence.
            with open(join(out_dir, '{}.tagged'.format(c)), 'a') as out_file:
                writeSentence(s, out=out_file)
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
Exemplo n.º 2
0
 def __init__(self, features, transModel, options):
     """Keep the feature/transition models and load the observation model.

     :param features: stored as ``self._features``.
     :param transModel: stored as ``self._transProbs``.
     :param options: mapping providing ``'dataSizes'``, ``'modelFileName'``,
         ``'featCounterFileName'`` and ``'labelCounterFileName'``.
     """
     self._features, self._transProbs = features, transModel
     self._dataSizes = options['dataSizes']

     # Progress is reported on stderr; the first message has no newline so
     # that 'done' finishes the same line.
     print('loading observation model...', end='', file=sys.stderr, flush=True)

     # NOTE(review): joblib.load unpickles its input -- only point this at
     # model files from a trusted source.
     model_path = '{0}'.format(options['modelFileName'])
     self._model = joblib.load(model_path)

     feat_counter_file = options['featCounterFileName']
     self._featCounter = BookKeeper(feat_counter_file)
     label_counter_file = options['labelCounterFileName']
     self._labelCounter = BookKeeper(label_counter_file)

     print('done', file=sys.stderr, flush=True)
Exemplo n.º 3
0
 def __init__(self, features, options):
     """Initialise empty training state from the given options.

     :param features: stored as ``self.features``.
     :param options: mapping providing ``'modelName'``, ``'trainParams'``,
         ``'cutoff'`` and ``'usedFeats'``. A truthy ``'usedFeats'`` value is
         iterated and every item is stripped into a set; otherwise
         ``self.usedFeats`` stays ``None``.
     """
     self.features = features
     self.modelName = options['modelName']
     self.parameters = options['trainParams']
     self.cutoff = options['cutoff']

     # Nothing collected yet: labels/contexts are filled in later.
     self.labels, self.contexts = [], []
     self.labelCounter = BookKeeper()
     self.featCounter = BookKeeper()

     used_feats_source = options['usedFeats']
     self.usedFeats = (
         {entry.strip() for entry in used_feats_source}
         if used_feats_source else None
     )
Exemplo n.º 4
0
    def __init__(self, features, options):
        """Set up the classifier, the training-data buffers and bookkeeping.

        :param features: stored as ``self._features``.
        :param options: mapping that must provide ``'dataSizes'``,
            ``'tagField'``, ``'modelFileName'``, ``'trainParams'``,
            ``'cutoff'``, ``'featCounterFileName'`` and
            ``'labelCounterFileName'``. ``'dataSizes'`` must itself provide
            ``'rows'``, ``'cols'``, ``'data'``, ``'labels'`` and
            ``'sentEnd'``. An optional, truthy ``'usedFeats'`` is the path of
            a UTF-8 text file; its stripped lines become ``self._usedFeats``.
        :raises KeyError: if a required key is missing.
        :raises OSError: if the ``'usedFeats'`` file cannot be opened.
        """

        # Set classifier algorithm here
        parameters = dict()  # dict(solver='lbfgs')
        solver = LogisticRegression

        # Possible alternative solvers:
        # parameters = {'loss':'modified_huber',  'n_jobs': -1}
        # solver = SGDClassifier

        # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
        # parameters = {'kernel': 'rbf', 'probability': True}
        # solver = SVC

        # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
        # parameters = {'kernel': 'linear', 'probability': True}
        # solver = OneVsRestClassifier(SVC(**parameters))  # XXX won't work because ** in parameters...

        self._model = solver(**parameters)
        self._dataSizes = options['dataSizes']
        self._tagField = options['tagField']
        self._modelFileName = options['modelFileName']
        self._parameters = options['trainParams']
        self._cutoff = options['cutoff']
        self._featCounterFileName = options['featCounterFileName']
        self._labelCounterFileName = options['labelCounterFileName']
        self._features = features

        self._tokCount = -1  # Index starts from 0

        # One buffer per 'dataSizes' entry.
        # NOTE(review): presumably each entry is an element type/typecode for
        # `array` -- confirm against the caller that builds 'dataSizes'.
        self._rows = array(self._dataSizes['rows'])
        self._cols = array(self._dataSizes['cols'])
        self._data = array(self._dataSizes['data'])
        self._labels = array(self._dataSizes['labels'])
        self._sentEnd = array(
            self._dataSizes['sentEnd'])  # Keep track of sentence boundaries
        self._matrix = None

        self._featCounter = BookKeeper()
        self._labelCounter = BookKeeper()
        self._usedFeats = None
        if 'usedFeats' in options and options['usedFeats']:
            # Read inside a context manager so the file handle is closed
            # deterministically instead of being left to the garbage collector.
            with open(options['usedFeats'], encoding='UTF-8') as used_feats_file:
                self._usedFeats = {line.strip() for line in used_feats_file}