Example #1
 def __init__(self, inFile, outFile):
     self.inFile = inFile
     self.outFile = outFile
     self.normalizer = Normalizer()
     self.tagger = POSTagger(model='resources/postagger.model')
     self.lemmatizer = Lemmatizer()
     self.stemmer = Stemmer()
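The constructor above only wires the hazm components together; below is a minimal sketch, assuming a typical hazm pipeline, of how such components are chained on a Persian string (the sample text is illustrative):

from hazm import Normalizer, POSTagger, Lemmatizer, Stemmer, word_tokenize

normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')
lemmatizer = Lemmatizer()
stemmer = Stemmer()

text = normalizer.normalize('کتاب‌ها را خواندم')  # fix character forms and spacing
tokens = word_tokenize(text)                      # split into word tokens
tagged = tagger.tag(tokens)                       # [(token, POS tag), ...]
lemmas = [lemmatizer.lemmatize(w) for w, _ in tagged]
stems = [stemmer.stem(w) for w, _ in tagged]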
Example #2
File: hazm.py  Project: AliKhalili/SHPA.NLP
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        super().__init__(component_config)
        if self.component_config.get("stemmer"):
            self._stemmer = Stemmer()

        if self.component_config.get("lemmatizer"):
            self._lemmatizer = Lemmatizer()

        if self.component_config.get("pos"):
            self._pos_tagger = POSTagger(model='resources/postagger.model')
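A sketch of the component config this constructor reads; the keys mirror the checks above, the values are illustrative:

component_config = {"stemmer": True, "lemmatizer": True, "pos": True}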
Example #3
	def nouns(self, texts):
		total_count = len(texts)
		tagger = POSTagger()
		nouns = []
		tagged_doc = tagger.tag_sents(texts)
		for sent in tagged_doc:
			sentence = []
			for word, tag in sent:
				if tag == 'N':
					sentence.append(word)
			nouns.append(sentence)

		return nouns
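tag_sents expects a list of already-tokenized sentences, and the 'N' check matches the noun tag of hazm's older tag set. A short usage sketch with illustrative sentences:

from hazm import word_tokenize

texts = [word_tokenize('کتاب جدید را خریدم'), word_tokenize('هوا سرد است')]
# nouns(texts) returns one list of noun tokens per sentence,
# e.g. something like [['کتاب'], ['هوا']] depending on the tagger model.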
Example #4
 def __init__(self, question, useStemmer=False, useSynonyms=False, removeStopwords=False):
     self.question = question
     self.useStemmer = useStemmer
     self.useSynonyms = useSynonyms
     self.removeStopwords = removeStopwords
     self.stopWords = stopwords.words("english")
     self.stem = lambda k : k.lower()
     if self.useStemmer:
         ps = PorterStemmer()
         self.stem = ps.stem
     self.qType = self.determineQuestionType(question)
     self.searchQuery = self.buildSearchQuery(question)
     self.qVector = self.getQueryVector(self.searchQuery)
     self.aType = self.determineAnswerType(question)
     post = POSTagger()
Example #5
def train_pos_tagger(bijankhan_file='resources/bijankhan.txt', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/persian.tagger.props', memory_min='-Xms1g', memory_max='-Xmx2g', test_split=.1):
	bijankhan = BijankhanReader(bijankhan_file)
	train_file = 'resources/tagger_train_data.txt'
	output = codecs.open(train_file, 'w', 'utf8')
	sentences = list(bijankhan.sents())
	train_part = int(len(sentences) * (1 - test_split))

	for sentence in sentences[:train_part]:
		print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)
	cmd = ['java', memory_min, memory_max, '-classpath', path_to_jar, 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file, '-model', path_to_model,  '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']
	process = subprocess.Popen(cmd)
	process.wait()

	tagger = POSTagger()
	print('\n\n', 'Tagger Accuracy on Test Split:', tagger.evaluate(sentences[train_part:]))
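Each line the loop above writes to tagger_train_data.txt is a whitespace-separated run of word/tag pairs joined by '/'; a small sketch of that transformation with illustrative pairs:

sentence = [('از', 'P'), ('تهران', 'N'), ('آمد', 'V')]  # illustrative (word, tag) pairs
line = ' '.join('/'.join(w).replace(' ', '_') for w in sentence)
print(line)  # -> از/P تهران/N آمد/V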
Example #6
 def __init__(self, code, config, **kwargs):
     """ Constructor
     :param code: code
     :type code: str
     :param config: app config
     :type config: dict
     """
     super(HazmEngine, self).__init__(code, config, **kwargs)
     self.code = code
     self.config = config
     self.oa_transformer = OaLegacyTransformer()
     self.language_codes = ['per', 'fas']
     self.uri = self.config['PARSERS_HAZM_URI']
     self.tagger = POSTagger(model=os.path.join(os.path.dirname(__file__),
                                                'hazm', "postagger.model"))
Example #7
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]

    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(
            skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)

            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, "pos": tag})

            tags_collection.insert_one({
                "id": doc["id"],
                "categories_fa": doc["categories_fa"],
                "text": doc["text"],
                "words": words
            })

            done += 1
            #if done % 100 == 0:
            end = time.time()
            print('Worker {}: Done {} out of {} in {:.2f} sec ~ {:.2f}/sec'.format(
                identifier, done, count, end - start, done / (end - start)))
            sys.stdout.flush()
Example #8
def hazmtoalpheios(word,uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    item = normalizer.normalize(word)
    analyses = []
    stemmer = Stemmer()
    wordstem = stemmer.stem(item)
    lemmatizer = Lemmatizer()
    wordlema = lemmatizer.lemmatize(item)
    if '#' in wordlema:
        wordlema, _ = wordlema.split("#")
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))
    wordtagged = tagger.tag(word_tokenize(item))
    wordpofs = wordtagged[0][1]
    wordpofs = maptohazm(wordpofs)
    # a better way to do this would be to create a Python class
    # to formalize the abstraction
    analysis = {}
    analysis['engine'] = 'hazm'
    analysis['uri'] = uri
    analysis['form'] = {}
    analysis['form']['text'] = item
    analysis['form']['lang'] = 'per'
    analysis['entries'] = []
    entry = {}
    entry['dict'] = {}
    entry['dict']['hdwd'] = {}
    entry['dict']['hdwd']['lang'] = 'per'
    entry['dict']['hdwd']['text'] = wordstem
    entry['infls'] = []
    infl = {}
    infl['stem'] = {} 
    infl['stem']['text'] = wordstem
    infl['stem']['lang'] = 'per'
    infl['pofs'] = {}
    if wordpofs:
        infl['pofs']['order'] = str(wordpofs[1])
        infl['pofs']['text'] = wordpofs[0]
    entry['infls'].append(infl)
    analysis['entries'].append(entry)
    analyses.append(analysis)
    return analyses
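A short usage sketch of the function above; the word and URI are illustrative, and maptohazm and model_path must already be defined in the surrounding module, as the function assumes:

analyses = hazmtoalpheios('کتاب‌ها', 'urn:example:word:1')
for analysis in analyses:
    print(analysis['form']['text'], analysis['entries'][0]['dict']['hdwd']['text'])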
Example #9
def hazmtoalpheiosfile(data,uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")    
    oaannotation = etree.SubElement(root,'{http://www.w3.org/ns/oa#}Annotation',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':'http://services.projectbamboo.org/morphology'+uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody',)
    oahastarget = etree.SubElement(oaannotation,'{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(oahastarget,'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    ispartof = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}isPartOf',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':uri})
    source = etree.SubElement(hasbodydesc,'{http://purl.org/dc/terms/}source',{'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource':uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {'{http://www.w3.org/XML/1998/namespace}lang':'eng'})
    title.text = "Morphology of " + uri
    wordslist = etree.SubElement("words")
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    # build the analyzers once instead of once per word
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            wordlema, _ = wordlema.split("#")
        wordtagged = tagger.tag([item])  # tag() expects a list of tokens
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist,'word')
        form = etree.SubElement(word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang':'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
Example #10
def train_dependency_parser(train_file='resources/train.conll', test_file='resources/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/options.xml', features_file='resources/features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):

	def read_conll(conll_file):
		trees = [DependencyGraph(item) for item in dadegan_text(conll_file).replace(' ', '_').split('\n\n') if item.strip()]
		sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
		return trees, sentences

	lemmatizer, tagger = Lemmatizer(), POSTagger()

	trees, sentences = read_conll(train_file)
	tagged = tagger.batch_tag(sentences)

	train_data = train_file +'.data'
	with codecs.open(train_data, 'w', 'utf8') as output:
		for tree, sentence in zip(trees, tagged):
			for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
				node['tag'] = word[1]
				node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
				print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
			print(file=output)

	cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']
	process = subprocess.Popen(cmd)
	process.wait()

	# evaluation
	print('\nEvaluating trained model on test data:')
	parser = DependencyParser(tagger=tagger, model_file=model_file)

	trees, sentences = read_conll(test_file)
	tagged = tagger.batch_tag(sentences)
	parsed = parser.tagged_batch_parse(tagged)

	test_data, test_results = test_file +'.data', test_file +'.results'
	print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
	print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=codecs.open(test_results, 'w', 'utf8'))

	cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
	process = subprocess.Popen(cmd)
	process.wait()
Example #11
    def __init__(self,
                 corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01, expand_corpus=False):
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.corpus = {}
        self.docs_num = 0
        self.expand_corpus = expand_corpus

        if self.corpus_path is not None:
            with open(corpus_path, encoding='utf-8') as json_file:
                corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

        with open(symbols_json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]

        with open(persian_lang_path, encoding='utf-8') as json_file:
            persian_lang = json.load(json_file)

        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences
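The constructor above reads only a handful of keys from each JSON resource; a sketch of the layouts it assumes, with illustrative values:

corpus_stub = {"corpus": {}, "docs_num": 0}                    # corpus.json
symbols_stub = {"group_a": ["نماد۱", "نماد۲"]}                 # symbols.json: group -> list of symbols
persian_lang_stub = {"epic_keywords": [], "punctuations": [],  # persian_lang.json
                     "persian_alphabet": [], "stop_words": []}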
Example #12
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging
from hazm import *

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    #from pattern.en import tag
    from hazm import POSTagger
    tagger = POSTagger(model='resources/postagger.model')
    logger.info("'hazm' package found; tag filters are available for Persian")
    HAS_PATTERN = True
except ImportError:
    #logger.info("'pattern' package not found; tag filters are not available for English")
    logger.info("'hazm' package not found; tag filters are not available for Persian")
    HAS_PATTERN = False

SEPARATOR = r'@'
RE_SENTENCE = re.compile(
    r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)',
    re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
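A minimal sketch of how RE_SENTENCE defined above carves raw text into sentences (the sample string is illustrative):

sample = 'جملهٔ اول است. جملهٔ دوم!'
sentences = [m.group() for m in RE_SENTENCE.finditer(sample)]
# -> ['جملهٔ اول است.', 'جملهٔ دوم!']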
Example #13
# -*- coding: UTF-8 -*-

from hazm import word_tokenize, POSTagger, Stemmer, Chunker, tree2brackets

POSTAGGER_MODEL = 'resources/postagger.model'

tagger = POSTagger(model=POSTAGGER_MODEL)
chunker = Chunker(model='resources/chunker.model')

BLACK_LIST = [
    'RT',
    'برای',
    'این',
]


def is_word_ok(word):
    return len(word) >= 3 and word not in BLACK_LIST


def get_hash_tags(text):
    return set([word for word in text.strip().split() if word.strip().startswith('#')])


def get_names(text):
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(filter(
        lambda word: is_word_ok(word),
        [tagged_word[0] for tagged_word in filter(lambda tagged_word: tagged_word[1] == 'N', tagged_words)]
    ))