예제 #1
0
파일: huntag.py 프로젝트: zbxzc35/HunTag
def main_bigramTrain(options, input):
    """Build a bigram model from the tag column of a corpus and save it.

    For every sentence produced by ``sentenceIterator(input)`` the
    ``options.tagField`` element of each token is collected and the resulting
    list is handed to the model as one observed sequence.  Once all sentences
    have been observed, ``count()`` is called on the model and it is written
    to ``options.bigramModelFile``.
    """
    # The constructor argument is carried over unchanged from the original
    # (presumably a smoothing value -- TODO confirm against Bigram).
    model = Bigram(1e-15)
    for sentence, _comment in sentenceIterator(input):
        tag_sequence = [token[options.tagField] for token in sentence]
        model.obsSequence(tag_sequence)
    model.count()
    model.writeToFile(options.bigramModelFile)
예제 #2
0
def main_bigramTrain(options, input):
    """Train a bigram model on the tags found in *input* and write it out.

    Each sentence yielded by ``sentenceIterator(input)`` contributes one
    observation: the list of ``options.tagField`` entries of its tokens.
    After the last sentence the model's ``count()`` is invoked and the model
    is written to the path in ``options.bigramModelFile``.
    """
    # Same constructor argument as the original literal 0.000000000000001
    # (its meaning is defined by Bigram -- NOTE(review): confirm).
    trained = Bigram(1e-15)
    for tokens, _ in sentenceIterator(input):
        observed_tags = [fields[options.tagField] for fields in tokens]
        trained.obsSequence(observed_tags)
    trained.count()
    trained.writeToFile(options.bigramModelFile)
예제 #3
0
 def tagCorp(self, inputStream=sys.stdin):
     """Lazily tag every sentence read from *inputStream*.

     Yields ``(taggedSentence, comment)`` pairs.  Each token of the tagged
     sentence is the original token with the tag chosen by
     ``self._tagSenFeats`` appended as a new last element.  A progress
     counter is printed to stderr after every 1000th sentence has been
     yielded, and a final total once the input is exhausted.
     """
     processed = 0  # stays 0 when the input holds no sentences
     for processed, (sentence, comment) in enumerate(sentenceIterator(inputStream), start=1):
         features = featurizeSentence(sentence, self._features)
         tagging = self._tagSenFeats(features)
         # Append the predicted tag to a copy of each token's field list
         tagged = [token + [tagging[position]] for position, token in enumerate(sentence)]
         yield tagged, comment
         if processed % 1000 == 0:
             print('{0}...'.format(processed), end='', file=sys.stderr, flush=True)
     print('{0}...done'.format(processed), file=sys.stderr, flush=True)
예제 #4
0
파일: tagger.py 프로젝트: kszucs/HunTag
 def tag_corp(self, input):
     """Lazily tag every sentence read from *input*.

     Yields ``(taggedSentence, comment)`` pairs, where the tagged sentence is
     whatever ``addTagging`` returns for the sentence and the tagging chosen
     by ``self.tag_sen_feats``.  A running count is written to stderr after
     every 1000th sentence and a final total after the last one.
     """
     processed = 0  # stays 0 when the input holds no sentences
     for processed, (sentence, comment) in enumerate(sentenceIterator(input), start=1):
         features = featurizeSentence(sentence, self.featureSet)
         tagging = self.tag_sen_feats(features)
         yield addTagging(sentence, tagging), comment
         if processed % 1000 == 0:
             sys.stderr.write(str(processed) + '...')
     sys.stderr.write(str(processed) + '...done\n')
예제 #5
0
 def tag_corp(self, input):
     """Generate tagged sentences for everything readable from *input*.

     For each ``(sentence, comment)`` pair from ``sentenceIterator`` the
     sentence is featurized with ``self.featureSet``, tagged through
     ``self.tag_sen_feats`` and combined with its tagging by ``addTagging``;
     the result is yielded together with the untouched comment.  Progress is
     reported on stderr every 1000 sentences, plus a closing total.
     """
     count = 0  # remains 0 if no sentence is ever produced
     for count, (tokens, comment) in enumerate(sentenceIterator(input), start=1):
         tokenFeatures = featurizeSentence(tokens, self.featureSet)
         chosenTags = self.tag_sen_feats(tokenFeatures)
         result = addTagging(tokens, chosenTags)
         yield result, comment
         if count % 1000 == 0:
             sys.stderr.write(str(count) + '...')
     sys.stderr.write(str(count) + '...done\n')
예제 #6
0
 def toCRFsuite(self, inputStream, outputStream=sys.stdout):
     """Write the features of every token in *inputStream* to *outputStream*.

     One tab-separated line is printed per token, listing the names of that
     token's features for which ``self._featCounter.getNoTag`` returns a
     number (features mapped to ``None`` are skipped).  Every ':' in a
     feature name is replaced by the text 'colon'.  A blank line follows
     each sentence.  Progress is reported on stderr every 1000 sentences,
     plus a closing total.
     """
     featToNumber = self._featCounter.getNoTag
     numberToName = self._featCounter.noToName
     sentencesDone = 0  # stays 0 when the input holds no sentences
     for sentencesDone, (sentence, _comment) in enumerate(sentenceIterator(inputStream), start=1):
         for tokenFeats in featurizeSentence(sentence, self._features):
             # Translate feature names to numbers; collecting into a set
             # drops duplicates and features without a number.
             featNumbers = {featToNumber(feat) for feat in tokenFeats
                            if featToNumber(feat) is not None}
             escapedNames = (numberToName[number].replace(':', 'colon') for number in featNumbers)
             print('\t'.join(escapedNames), file=outputStream)
         print(file=outputStream)  # Sentence separator blank line
         if sentencesDone % 1000 == 0:
             print('{0}...'.format(sentencesDone), end='', file=sys.stderr, flush=True)
     print('{0}...done'.format(sentencesDone), file=sys.stderr, flush=True)
예제 #7
0
    def getEvents(self, data, out_file_name):
        """Featurize every sentence in *data* and register one event per token.

        For each token the feature list computed by ``featurizeSentence`` is
        (optionally) restricted to ``self.usedFeats`` and passed, together
        with the token's last field, to ``self.addContext``.

        :param data: input handed to ``sentenceIterator``.
        :param out_file_name: if truthy, path of a file that receives one
            ``label<TAB>space-separated-features`` line per token and a blank
            line after each sentence; if falsy, nothing is written.

        The output file is now closed (and thereby flushed) when processing
        ends, including when an exception interrupts the loop; previously
        the handle was left open.
        """
        sys.stderr.write('featurizing sentences...')
        senCount = 0
        out_file = open(out_file_name, 'w') if out_file_name else None
        try:
            for sen, _ in sentenceIterator(data):
                senCount += 1
                sentenceFeats = featurizeSentence(sen, self.features)
                for c, tok in enumerate(sen):
                    tokFeats = sentenceFeats[c]
                    if self.usedFeats:
                        tokFeats = [feat for feat in tokFeats
                                    if feat in self.usedFeats]
                    # tok[-1] serves as the label (presumably the gold tag is
                    # the last column -- TODO confirm against the corpus format)
                    if out_file is not None:
                        out_file.write(tok[-1]+'\t'+' '.join(tokFeats)+'\n')
                    self.addContext(tokFeats, tok[-1])
                if out_file is not None:
                    out_file.write('\n')
                if senCount % 1000 == 0:
                    sys.stderr.write(str(senCount)+'...')
        finally:
            # Release the handle no matter how the loop ends
            if out_file is not None:
                out_file.close()

        sys.stderr.write(str(senCount)+'...done!\n')
예제 #8
0
    def getEvents(self, data):
        """Featurize every sentence in *data* and register one event per token.

        Tokens are numbered consecutively across the whole input, starting
        at 0.  For each token its feature list (restricted to
        ``self._usedFeats`` when that is non-empty) is passed to
        ``self._addContext`` along with the token's ``self._tagField`` entry
        and its running index.  After each sentence the index of its last
        token is appended to ``self._sentEnd``; at the end the total number
        of tokens is stored in ``self._tokCount``.  Progress is reported on
        stderr.
        """
        print('featurizing sentences...', end='', file=sys.stderr, flush=True)
        sentencesSeen = 0  # stays 0 when the input holds no sentences
        lastTokIndex = -1  # index of the most recent token; first token gets 0
        for sentencesSeen, (sentence, _comment) in enumerate(sentenceIterator(data), start=1):
            featsPerToken = featurizeSentence(sentence, self._features)
            for position, token in enumerate(sentence):
                lastTokIndex += 1
                feats = featsPerToken[position]
                if self._usedFeats:
                    feats = [feat for feat in feats if feat in self._usedFeats]
                self._addContext(feats, token[self._tagField], lastTokIndex)
            self._sentEnd.append(lastTokIndex)
            if sentencesSeen % 1000 == 0:
                print('{0}...'.format(sentencesSeen), end='', file=sys.stderr, flush=True)

        self._tokCount = lastTokIndex + 1
        print('{0}...done!'.format(sentencesSeen), file=sys.stderr, flush=True)
예제 #9
0
파일: trainer.py 프로젝트: zbxzc35/HunTag
    def getEvents(self, data, out_file_name):
        """Featurize every sentence in *data* and register one event per token.

        For each token the feature list computed by ``featurizeSentence`` is
        (optionally) restricted to ``self.usedFeats`` and passed, together
        with the token's last field, to ``self.addContext``.

        :param data: input handed to ``sentenceIterator``.
        :param out_file_name: if truthy, path of a file that receives one
            ``label<TAB>space-separated-features`` line per token and a blank
            line after each sentence; if falsy, nothing is written.

        The output file is now closed (and thereby flushed) when processing
        ends, including when an exception interrupts the loop; previously
        the handle was left open.
        """
        sys.stderr.write('featurizing sentences...')
        senCount = 0
        out_file = open(out_file_name, 'w') if out_file_name else None
        try:
            for sen, _ in sentenceIterator(data):
                senCount += 1
                sentenceFeats = featurizeSentence(sen, self.features)
                for c, tok in enumerate(sen):
                    tokFeats = sentenceFeats[c]
                    if self.usedFeats:
                        tokFeats = [
                            feat for feat in tokFeats if feat in self.usedFeats
                        ]
                    # tok[-1] serves as the label (presumably the gold tag is
                    # the last column -- TODO confirm against the corpus format)
                    if out_file is not None:
                        out_file.write(tok[-1] + '\t' + ' '.join(tokFeats) + '\n')
                    self.addContext(tokFeats, tok[-1])
                if out_file is not None:
                    out_file.write('\n')
                if senCount % 1000 == 0:
                    sys.stderr.write(str(senCount) + '...')
        finally:
            # Release the handle no matter how the loop ends
            if out_file is not None:
                out_file.close()

        sys.stderr.write(str(senCount) + '...done!\n')
예제 #10
0
 def train(self, inputStream):
     """Observe the tag sequence of every sentence in *inputStream*.

     For each sentence from ``sentenceIterator`` a generator over the
     ``self._tagField`` entry of its tokens is passed to
     ``self.obsSequence``.
     """
     for sentence, _comment in sentenceIterator(inputStream):
         tagStream = (token[self._tagField] for token in sentence)
         self.obsSequence(tagStream)
예제 #11
0
 def train(self, inputStream):
     """Feed each sentence's tags from *inputStream* to ``obsSequence``.

     The tags are supplied lazily: ``self.obsSequence`` receives a generator
     that reads the ``self._tagField`` element of every token in the
     sentence.
     """
     for tokens, _ in sentenceIterator(inputStream):
         lazyTags = (fields[self._tagField] for fields in tokens)
         self.obsSequence(lazyTags)