def __init__(self, inFile, outFile):
    self.inFile = inFile
    self.outFile = outFile
    # hazm text-processing components used throughout the class.
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.lemmatizer = Lemmatizer()
    self.stemmer = Stemmer()
def __init__(self, component_config: Dict[Text, Any] = None) -> None:
    super().__init__(component_config)
    # Only build the hazm components that the configuration asks for.
    if self.component_config.stemmer:
        self._stemmer = Stemmer()
    if self.component_config.lemmatizer:
        self._lemmatizer = Lemmatizer()
    if self.component_config.pos:
        self._pos_tagger = POSTagger(model='resources/postagger.model')
def nouns(self, texts):
    tagger = POSTagger()
    nouns = []
    tagged_doc = tagger.tag_sents(texts)
    for sent in tagged_doc:
        # Keep only the words tagged as nouns ('N').
        sentence = [word for word, tag in sent if tag == 'N']
        nouns.append(sentence)
    return nouns
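# A minimal usage sketch, assuming a default tagger model is available and
# that the enclosing class is instantiable as shown (NounExtractor is a
# hypothetical name). `texts` must be a list of already-tokenized sentences,
# since tag_sents() expects lists of tokens:
#
#   extractor = NounExtractor()
#   sents = [word_tokenize('او به مدرسه رفت')]
#   print(extractor.nouns(sents))  # e.g. [['مدرسه']]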
def __init__(self, question, useStemmer=False, useSynonyms=False, removeStopwords=False):
    self.question = question
    self.useStemmer = useStemmer
    self.useSynonyms = useSynonyms
    self.removeStopwords = removeStopwords
    self.stopWords = stopwords.words("english")
    # Lowercasing default; replaced by a real stemmer when requested.
    self.stem = lambda k: k.lower()
    if self.useStemmer:
        ps = PorterStemmer()
        self.stem = ps.stem
    self.qType = self.determineQuestionType(question)
    self.searchQuery = self.buildSearchQuery(question)
    self.qVector = self.getQueryVector(self.searchQuery)
    self.aType = self.determineAnswerType(question)
    post = POSTagger()
def train_pos_tagger(bijankhan_file='resources/bijankhan.txt',
                     path_to_model='resources/persian.tagger',
                     path_to_jar='resources/stanford-postagger.jar',
                     properties_file='resources/persian.tagger.props',
                     memory_min='-Xms1g', memory_max='-Xmx2g', test_split=.1):
    bijankhan = BijankhanReader(bijankhan_file)
    train_file = 'resources/tagger_train_data.txt'
    sentences = list(bijankhan.sents())
    train_part = int(len(sentences) * (1 - test_split))

    # Write the training split as word/tag tokens, one sentence per line;
    # the with-block closes the file so it is flushed before training starts.
    with codecs.open(train_file, 'w', 'utf8') as output:
        for sentence in sentences[:train_part]:
            print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)

    # Train a Stanford maxent POS tagger on the prepared file.
    cmd = ['java', memory_min, memory_max, '-classpath', path_to_jar,
           'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file,
           '-model', path_to_model, '-trainFile', train_file,
           '-tagSeparator', '/', '-search', 'owlqn2']
    process = subprocess.Popen(cmd)
    process.wait()

    tagger = POSTagger()
    print('\n\n', 'Tagger Accuracy on Test Split:', tagger.evaluate(sentences[train_part:]))
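# For reference, each training line above joins every (word, tag) pair with
# '/' and replaces inner spaces with underscores, so the tagged pair
# ('رفته است', 'V') is written as:
#
#   رفته_است/V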
def __init__(self, code, config, **kwargs):
    """
    Constructor

    :param code: code
    :type code: str
    :param config: app config
    :type config: dict
    """
    super(HazmEngine, self).__init__(code, config, **kwargs)
    self.code = code
    self.config = config
    self.oa_transformer = OaLegacyTransformer()
    self.language_codes = ['per', 'fas']
    self.uri = self.config['PARSERS_HAZM_URI']
    self.tagger = POSTagger(
        model=os.path.join(os.path.dirname(__file__), 'hazm', "postagger.model"))
def worker(identifier, skip, count):
    tagger = POSTagger()
    done = 0
    start = time.time()
    stopwords = load_stopwords()
    documents_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.HAMSHAHRI_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    tags_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.TAGS_DATABASE][Settings.HAMSHAHRI_COLLECTION]
    batch_size = 50
    for batch in range(0, count, batch_size):
        hamshahri_cursor = documents_collection.find().skip(skip + batch).limit(batch_size)
        for doc in hamshahri_cursor:
            words = []
            # Sentence-split, tokenize, and drop stopwords before tagging.
            sentences = sent_tokenize(doc['text'])
            sents = []
            for sentence in sentences:
                tokens = word_tokenize(sentence)
                text = [word for word in tokens if word not in stopwords]
                sents.append(text)
            tags = tagger.tag_sents(sents)
            for sent in tags:
                for word, tag in sent:
                    words.append({'word': word, 'pos': tag})
            # insert() is gone from recent pymongo; insert_one is the equivalent.
            tags_collection.insert_one({
                'id': doc['id'],
                'categories_fa': doc['categories_fa'],
                'text': doc['text'],
                'words': words,
            })
            done += 1
            end = time.time()
            print('Worker' + str(identifier) + ': Done ' + str(done) +
                  ' out of ' + str(count) + ' in ' + ('%.2f' % (end - start)) +
                  ' sec ~ ' + ('%.2f' % (done / (end - start))) + '/sec')
            sys.stdout.flush()
def hazmtoalpheios(word, uri):
    wordslist = etree.Element("words")
    normalizer = Normalizer()
    item = normalizer.normalize(word)
    analyses = []
    stemmer = Stemmer()
    wordstem = stemmer.stem(item)
    lemmatizer = Lemmatizer()
    wordlema = lemmatizer.lemmatize(item)
    if '#' in wordlema:
        # hazm joins past/present verb stems with '#'; keep the first part.
        wordlema, _ = wordlema.split("#")
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    wordtagged = tagger.tag(word_tokenize(item))
    wordpofs = wordtagged[0][1]
    wordpofs = maptohazm(wordpofs)
    # A better way to do this would be to create a Python class
    # to formalize the abstraction.
    analysis = {}
    analysis['engine'] = 'hazm'
    analysis['uri'] = uri
    analysis['form'] = {}
    analysis['form']['text'] = item
    analysis['form']['lang'] = 'per'
    analysis['entries'] = []
    entry = {}
    entry['dict'] = {}
    entry['dict']['hdwd'] = {}
    entry['dict']['hdwd']['lang'] = 'per'
    entry['dict']['hdwd']['text'] = wordstem
    entry['infls'] = []
    infl = {}
    infl['stem'] = {}
    infl['stem']['text'] = wordstem
    infl['stem']['lang'] = 'per'
    infl['pofs'] = {}
    if wordpofs:
        infl['pofs']['order'] = str(wordpofs[1])
        infl['pofs']['text'] = wordpofs[0]
    entry['infls'].append(infl)
    analysis['entries'].append(entry)
    analyses.append(analysis)
    return analyses
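# A rough sketch of what hazmtoalpheios() returns, assuming maptohazm()
# (defined elsewhere in the source) maps the hazm tag to a (text, order)
# pair, e.g. 'N' -> ('noun', 1) (the mapping values here are hypothetical):
#
#   analyses = hazmtoalpheios('کتاب', '/some/uri')
#   # [{'engine': 'hazm', 'uri': '/some/uri',
#   #   'form': {'text': 'کتاب', 'lang': 'per'},
#   #   'entries': [{'dict': {'hdwd': {'lang': 'per', 'text': 'کتاب'}},
#   #                'infls': [{'stem': {'text': 'کتاب', 'lang': 'per'},
#   #                           'pofs': {'text': 'noun', 'order': '1'}}]}]}]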
def hazmtoalpheiosfile(data, uri):
    root = etree.Element("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF")
    oaannotation = etree.SubElement(
        root, '{http://www.w3.org/ns/oa#}Annotation',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about':
             'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasBody')
    oahastarget = etree.SubElement(oaannotation, '{http://www.w3.org/ns/oa#}hasTarget')
    hasbodydesc = etree.SubElement(
        oahastarget, '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    ispartof = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}isPartOf',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': uri})
    source = etree.SubElement(
        hasbodydesc, '{http://purl.org/dc/terms/}source',
        {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': uri})
    title = etree.SubElement(
        oaannotation, '{http://purl.org/dc/elements/1.1/}title',
        {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
    title.text = "Morphology of " + uri
    # SubElement needs a parent; attaching the word list to the annotation
    # body is an assumption, since the original gave no parent at all.
    wordslist = etree.SubElement(oahasbody, 'words')
    normalizer = Normalizer()
    data = normalizer.normalize(data)
    sentences = sent_tokenize(data)
    words = []
    for sentence in sentences:
        # Accumulate the tokens of every sentence into one flat list.
        words.extend(word_tokenize(sentence))
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    for item in words:
        wordstem = stemmer.stem(item)
        wordlema = lemmatizer.lemmatize(item)
        if '#' in wordlema:
            # hazm joins past/present verb stems with '#'; keep the first part.
            wordlema, _ = wordlema.split("#")
        # tag() expects a list of tokens, not a bare string.
        wordtagged = tagger.tag([item])
        wordpofs = wordtagged[0][1]
        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(
            word, 'form', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(
            infl, 'term', {'{http://www.w3.org/XML/1998/namespace}lang': 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
def train_dependency_parser(train_file='resources/train.conll',
                            test_file='resources/test.conll',
                            model_file='langModel.mco',
                            path_to_jar='resources/malt.jar',
                            options_file='resources/options.xml',
                            features_file='resources/features.xml',
                            memory_min='-Xms7g', memory_max='-Xmx8g'):

    def read_conll(conll_file):
        trees = [DependencyGraph(item) for item in
                 dadegan_text(conll_file).replace(' ', '_').split('\n\n')
                 if item.strip()]
        sentences = [[node['word'] for node in tree.nodelist[1:]] for tree in trees]
        return trees, sentences

    lemmatizer, tagger = Lemmatizer(), POSTagger()

    # Re-tag and lemmatize the training trees, then write them in CoNLL format.
    trees, sentences = read_conll(train_file)
    tagged = tagger.batch_tag(sentences)
    train_data = train_file + '.data'
    with codecs.open(train_data, 'w', 'utf8') as output:
        for tree, sentence in zip(trees, tagged):
            for i, (node, word) in enumerate(zip(tree.nodelist[1:], sentence), start=1):
                node['tag'] = word[1]
                node['lemma'] = lemmatizer.lemmatize(node['word'].replace('_', ' '), node['tag'])
                print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'),
                      node['tag'], node['tag'], '_', node['head'], node['rel'], '_', '_',
                      sep='\t', file=output)
            print(file=output)

    # Train a MaltParser model on the prepared data.
    cmd = ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources',
           '-c', model_file, '-i', train_data, '-f', options_file,
           '-F', features_file, '-m', 'learn']
    process = subprocess.Popen(cmd)
    process.wait()

    # Evaluation: parse the test split and compare against the gold trees.
    print('\nEvaluating trained model on test data:')
    parser = DependencyParser(tagger=tagger, model_file=model_file)
    trees, sentences = read_conll(test_file)
    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)
    test_data, test_results = test_file + '.data', test_file + '.results'
    # Close (flush) both files before MaltEval reads them.
    with codecs.open(test_data, 'w', 'utf8') as output:
        print('\n'.join([sentence.to_conll(10) for sentence in trees]).strip(), file=output)
    with codecs.open(test_results, 'w', 'utf8') as output:
        print('\n'.join([sentence.to_conll(10) for sentence in parsed]).strip(), file=output)
    cmd = ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]
    process = subprocess.Popen(cmd)
    process.wait()
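# For reference, each CoNLL training line printed above is tab-separated as
#
#   index  word  lemma  tag  tag  _  head  rel  _  _
#
# with spaces inside tokens replaced by underscores and a blank line
# separating sentences.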
def __init__(self, corpus_path='resources/corpus.json',
             symbols_json_path='resources/symbols.json',
             persian_lang_path='resources/persian_lang.json',
             postagger_model_path='resources/postagger.model',
             max_keyword_num=10, min_keyword_occurrences=0.01,
             expand_corpus=False):
    self.postagger_model_path = postagger_model_path
    self.symbols_json_path = symbols_json_path
    self.corpus_path = corpus_path
    self.corpus = {}
    self.docs_num = 0
    self.expand_corpus = expand_corpus
    if self.corpus_path is not None:
        with open(corpus_path, encoding='utf-8') as json_file:
            corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']
    with open(symbols_json_path, encoding='utf-8') as json_file:
        data = json.load(json_file)
        lst = list(data.values())
        # Flatten the per-category symbol lists into one list.
        self.all_symbols_list = [item for sublist in lst for item in sublist]
    with open(persian_lang_path, encoding='utf-8') as json_file:
        persian_lang = json.load(json_file)
        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']
    self.tagger = POSTagger(model=self.postagger_model_path)
    self.normalizer = Normalizer()
    self.max_keyword_num = max_keyword_num
    self.min_keyword_occurrences = min_keyword_occurrences
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging

from hazm import *

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    from hazm import POSTagger
    tagger = POSTagger(model='resources/postagger.model')
    logger.info("hazm POS tagger found; tag filters are available for Persian")
    # Flag name kept from the original English 'pattern'-based cleaner.
    HAS_PATTERN = True
except ImportError:
    logger.info("hazm POS tagger not found; tag filters are not available for Persian")
    HAS_PATTERN = False

SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
# -*- coding: UTF-8 -*-
from hazm import word_tokenize, POSTagger, Stemmer, Chunker, tree2brackets

POSTAGGER_MODEL = 'resources/postagger.model'

tagger = POSTagger(model=POSTAGGER_MODEL)
chunker = Chunker(model='resources/chunker.model')

BLACK_LIST = [
    'RT',
    'برای',
    'این',
]


def is_word_ok(word):
    # Skip short tokens and blacklisted words.
    return len(word) >= 3 and word not in BLACK_LIST


def get_hash_tags(text):
    return set(word for word in text.strip().split() if word.strip().startswith('#'))


def get_names(text):
    # Tag the tokens, keep only nouns (tag 'N'), then apply the word filter.
    tagged_words = tagger.tag(word_tokenize(text))
    words = set(
        word for word, tag in tagged_words
        if tag == 'N' and is_word_ok(word)
    )
    return words
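# A minimal usage sketch on a sample tweet (assumes the postagger and
# chunker model files exist at the paths above; the exact nouns returned
# depend on the tagger):
#
#   text = 'RT این کتاب درباره تهران است #تهران'
#   print(get_hash_tags(text))  # {'#تهران'}
#   print(get_names(text))      # e.g. {'کتاب', 'تهران'}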