Пример #1
0
class SequenceTagger(TaggerI):
	""" wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger

	>>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]'])
	>>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]])
	>>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
	[[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]

	>>> tagger.save_model('resources/test.model')
	>>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
	[[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]
	"""

	def __init__(self, patterns=(), **options):
		"""Create the underlying Wapiti ``Model``.

		patterns: iterable of pattern strings; they are joined with newlines
			and passed to ``Model`` as its ``patterns`` option. The default is
			an immutable empty tuple rather than a shared mutable list.
		options: extra keyword arguments forwarded unchanged to ``Model``
			(the class doctest passes ``model=...`` this way).
		"""
		# Imported here, so the wapiti binding is only required on instantiation.
		from wapiti import Model
		self.model = Model(patterns='\n'.join(patterns), **options)

	def train(self, sentences):
		"""Train the model on tagged sentences.

		sentences: iterable of sentences; each sentence is an iterable of
			string tuples such as ``(word, tag)``. The fields of a tuple are
			joined with a space (one token per line) and every sentence is
			handed to the model as one newline-separated string.
		"""
		self.model.train(['\n'.join([' '.join(word) for word in sentence]) for sentence in sentences])

	def tag(self, tokens):
		"""Tag a single sentence; returns the first result of ``tag_sents``."""
		return self.tag_sents([tokens])[0]

	def save_model(self, filename):
		"""Save the underlying model under ``filename``."""
		self.model.save(filename)

	def tag_sents(self, sentences):
		"""Tag several sentences with a single call to the model.

		sentences: iterable of sentences, each an iterable of token strings.

		Returns a list with one list of ``(token, tag)`` pairs per sentence.
		Spaces inside tokens are replaced with ``_`` only in the text sent to
		the model; the returned pairs carry the tokens exactly as given.
		"""
		# Materialise first: the sentences are walked twice (to build the
		# model input and to pair tokens with the predicted tags).
		sentences = list(sentences)
		# One token per line, sentences separated by an empty line.
		lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_')
		results = self.model.label_sequence(lines).decode('utf8')
		# Tags are consumed in order, one per token, across all sentences.
		tags = iter(results.strip().split('\n'))
		return [[(word, next(tags)) for word in sentence] for sentence in sentences]
Пример #2
0
class SequenceTagger(TaggerI):
    """ wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger

    >>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]'])
    >>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]])
    >>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]

    >>> tagger.save_model('resources/test.model')
    >>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]
    """

    def __init__(self, patterns=(), **options):
        """Create the underlying Wapiti ``Model``.

        patterns: iterable of pattern strings; they are joined with newlines
            and passed to ``Model`` as its ``patterns`` option. The default is
            an immutable empty tuple rather than a shared mutable list.
        options: extra keyword arguments forwarded unchanged to ``Model``
            (the class doctest passes ``model=...`` this way).
        """
        # Imported here, so the wapiti binding is only required on instantiation.
        from wapiti import Model
        self.model = Model(patterns='\n'.join(patterns), **options)

    def train(self, sentences):
        """Train the model on tagged sentences.

        sentences: iterable of sentences; each sentence is an iterable of
            string tuples such as ``(word, tag)``. The fields of a tuple are
            joined with a space (one token per line) and every sentence is
            handed to the model as one newline-separated string.
        """
        self.model.train([
            '\n'.join([' '.join(word) for word in sentence])
            for sentence in sentences
        ])

    def tag(self, tokens):
        """Tag a single sentence; returns the first result of ``tag_sents``.

        NOTE(review): NLTK's ``TaggerI`` expects subclasses to provide
        ``tag`` (it is abstract in recent NLTK releases - confirm against the
        installed version), so it is defined explicitly here.
        """
        return self.tag_sents([tokens])[0]

    def save_model(self, filename):
        """Save the underlying model under ``filename``."""
        self.model.save(filename)

    def tag_sents(self, sentences):
        """Tag several sentences with a single call to the model.

        sentences: iterable of sentences, each an iterable of token strings.

        Returns a list with one list of ``(token, tag)`` pairs per sentence.
        Spaces inside tokens are replaced with ``_`` only in the text sent to
        the model; the returned pairs carry the tokens exactly as given.
        """
        # Materialise first: the sentences are walked twice (to build the
        # model input and to pair tokens with the predicted tags).
        sentences = list(sentences)
        # One token per line, sentences separated by an empty line.
        lines = '\n\n'.join(['\n'.join(sentence)
                             for sentence in sentences]).replace(' ', '_')
        results = self.model.label_sequence(lines).decode('utf8')
        # Tags are consumed in order, one per token, across all sentences.
        tags = iter(results.strip().split('\n'))
        return [[(word, next(tags)) for word in sentence]
                for sentence in sentences]
Пример #3
0
class WapitiPOSTagger(TaggerI):
    """Tagger that labels token sequences with a Wapiti ``Model``.

    ``Model`` is expected to be available at module level (it is not imported
    inside this class).
    """

    def __init__(self, *args, **kwargs):
        """Create the underlying ``Model``.

        Only two keyword arguments are read; positional arguments and any
        other keyword arguments are accepted but ignored:

        model: passed to ``Model`` as ``model``; defaults to
            ``'resources/model.txt'``.
        pattern: passed to ``Model`` as ``pattern``; defaults to
            ``'resources/pattern.txt'``.
        """
        super(WapitiPOSTagger, self).__init__()
        self.model = Model(
            pattern=kwargs.get('pattern', 'resources/pattern.txt'),
            model=kwargs.get('model', 'resources/model.txt'),
        )

    def tag_sents(self, sents):
        """Lazily tag each sentence in ``sents``.

        This is a generator: it yields one list of ``(word, tag)`` pairs per
        sentence, produced by ``tag``.
        """
        for words in sents:
            yield self.tag(words)

    def tag(self, sent):
        """Tag one sentence and return a list of ``(word, tag)`` pairs.

        sent: iterable of token strings; it is materialised once so that
            one-shot iterators can be both sent to the model and paired with
            the resulting tags.
        """
        words = list(sent)
        labels = self.model.label_sequence('\n'.join(words))
        # On Python 3 the model output may be bytes (other Wapiti wrappers
        # decode it before use); splitting bytes on a str separator would
        # raise TypeError. On Python 2 ``bytes is str``, so nothing changes.
        if isinstance(labels, bytes) and not isinstance(labels, str):
            labels = labels.decode('utf-8')
        # zip stops at the shorter input, so any surplus entry from the split
        # (e.g. an empty string after a final newline) is dropped. list()
        # makes the result reusable instead of a one-shot iterator.
        return list(zip(words, labels.split('\n')))
Пример #4
0
class WapitiPOSTagger(TaggerI):
	"""Tagger that labels token sequences with a Wapiti ``Model``.

	``Model`` is expected to be available at module level (it is not imported
	inside this class).
	"""

	def __init__(self, *args, **kwargs):
		"""Create the underlying ``Model``.

		Only two keyword arguments are read; positional arguments and any
		other keyword arguments are accepted but ignored:

		model: passed to ``Model`` as ``model``; defaults to
			``'resources/model.txt'``.
		pattern: passed to ``Model`` as ``pattern``; defaults to
			``'resources/pattern.txt'``.
		"""
		super(WapitiPOSTagger, self).__init__()
		self.model = Model(
			pattern=kwargs.get('pattern', 'resources/pattern.txt'),
			model=kwargs.get('model', 'resources/model.txt'),
		)

	def tag_sents(self, sents):
		"""Lazily tag each sentence in ``sents``.

		This is a generator: it yields one list of ``(word, tag)`` pairs per
		sentence, produced by ``tag``.
		"""
		for words in sents:
			yield self.tag(words)

	def tag(self, sent):
		"""Tag one sentence and return a list of ``(word, tag)`` pairs.

		sent: iterable of token strings; it is materialised once so that
			one-shot iterators can be both sent to the model and paired with
			the resulting tags.
		"""
		words = list(sent)
		labels = self.model.label_sequence('\n'.join(words))
		# On Python 3 the model output may be bytes (other Wapiti wrappers
		# decode it before use); splitting bytes on a str separator would
		# raise TypeError. On Python 2 ``bytes is str``, so nothing changes.
		if isinstance(labels, bytes) and not isinstance(labels, str):
			labels = labels.decode('utf-8')
		# zip stops at the shorter input, so any surplus entry from the split
		# (e.g. an empty string after a final newline) is dropped. list()
		# makes the result reusable instead of a one-shot iterator.
		return list(zip(words, labels.split('\n')))
Пример #5
0
class POSTagger():
    """Part-of-speech tagger that delegates to a Stanford or a Wapiti backend.

    The backend is selected by ``tagging_model`` ("stanford" or "wapiti");
    on Windows the Stanford backend is always used.
    """

    def __init__(self,
                 stanford_postagger_model=None,
                 wapiti_postagger_model=None,
                 jar_tagger_path=None,
                 jdk_variable_path="C:/Program Files/Java/jdk1.8.0_121/bin/java.exe",
                 tagging_model="wapiti"):
        """Resolve resource locations and build the selected backend.

        stanford_postagger_model: Stanford model location; when None, a
            default under this module's directory is used.
        wapiti_postagger_model: Wapiti model location; when None, a default
            under this module's directory is used.
        jar_tagger_path: Stanford tagger jar location; when None, a default
            under this module's directory is used.
        jdk_variable_path: value written to the ``JAVAHOME`` environment
            variable when the Stanford backend is built.
        tagging_model: requested backend name; ignored on Windows.

        For any other backend name no ``tagger`` attribute is created.
        """
        import platform

        # Windows always gets the Stanford backend, whatever was requested.
        running_on_windows = platform.system() == "Windows"
        self.tagging_model = "stanford" if running_on_windows else tagging_model

        # Directory of this module, with a trailing slash for concatenation.
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        # Every resource falls back to a bundled default when not supplied.
        self.stanford_postagger_model = (
            self.dir_path + "resource/postagger/NC_model"
            if stanford_postagger_model is None
            else stanford_postagger_model
        )
        self.jar_tagger_path = (
            self.dir_path + 'resource/postagger/stanford-postagger.jar'
            if jar_tagger_path is None
            else jar_tagger_path
        )
        self.wapiti_postagger_model = (
            self.dir_path + "resource/postagger/UPC_full_model_wapiti"
            if wapiti_postagger_model is None
            else wapiti_postagger_model
        )

        if self.tagging_model == "stanford":
            os.environ['JAVAHOME'] = jdk_variable_path
            self.tagger = StanfordPOSTagger(
                model_filename=self.stanford_postagger_model,
                path_to_jar=self.jar_tagger_path,
                encoding='utf-8',
                java_options='-mx5000m',
            )
        elif self.tagging_model == "wapiti":
            from wapiti import Model
            self.tagger = Model(model=self.wapiti_postagger_model)

    def is_all_latin(self, word):
        """Return True when ``word`` has only ASCII letters (or is empty)."""
        # Whatever is left after deleting every run of ASCII letters.
        leftover = re.sub('[a-zA-Z]*', '', word)
        return len(leftover) == 0

    def parse(self, token_list):
        """Tag ``token_list`` and return a list of ``(token, tag)`` tuples.

        Words made only of ASCII letters are always tagged "FW". An empty
        list is returned when ``tagging_model`` is neither "stanford" nor
        "wapiti".
        """
        if self.tagging_model == "stanford":
            pairs = []
            for element in self.tagger.tag(token_list):
                # Glue the fields of the element together, then treat the
                # text after the last '/' as the tag and the rest as the word.
                glued = '_'.join(element).strip("_")
                head, _sep, tag = glued.rpartition('/')
                word = head.strip('/')
                pairs.append((word, "FW" if self.is_all_latin(word) else tag))
            return pairs

        if self.tagging_model == "wapiti":
            # One token per line in, one label per line out.
            raw_labels = self.tagger.label_sequence("\n".join(token_list))
            labels = raw_labels.decode('utf-8').strip().split('\n')
            return [
                (token, u"FW") if self.is_all_latin(token) else (token, labels[index])
                for index, token in enumerate(token_list)
            ]

        return []