def _analyze(db, fin, fout, backoff, cache):
    if cache:
        analyzer = Analyzer(db, backoff, cache_size=1024)
    else:
        analyzer = Analyzer(db, backoff)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            analyses = analyzer.analyze(token)
            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
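# A minimal usage sketch for _analyze() above (assumed wiring, not part of
# the original module): read whitespace-separated tokens from stdin and
# write serialized analyses to stdout using the built-in database.
import sys

from camel_tools.morphology.database import MorphologyDB

db = MorphologyDB.builtin_db()
_analyze(db, sys.stdin, sys.stdout, backoff='NOAN_PROP', cache=True)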
def pretrained(model_name='msa', top=1, use_gpu=True,
               batch_size=32, cache_size=10000):
    """Load a pre-trained model provided with camel_tools.

    Args:
        model_name (:obj:`str`, optional): Name of pre-trained model to
            load. Three models are available: 'msa', 'egy', and 'glf'.
            Defaults to 'msa'.
        top (:obj:`int`, optional): The maximum number of top analyses
            to return. Defaults to 1.
        use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
            Defaults to True.
        batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
        cache_size (:obj:`int`, optional): If greater than zero, then the
            analyzer will cache the analyses for the cache_size most
            frequent words, otherwise no analyses will be cached.
            Defaults to 10000.

    Returns:
        :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
        pre-trained model.
    """

    model_info = CATALOGUE.get_dataset('DisambigBertUnfactored', model_name)
    model_config = _read_json(Path(model_info.path, 'default_config.json'))
    model_path = str(model_info.path)
    features = FEATURE_SET_MAP[model_config['feature']]
    db = MorphologyDB.builtin_db(model_config['db_name'], 'a')
    analyzer = Analyzer(db,
                        backoff=model_config['backoff'],
                        cache_size=cache_size)
    scorer = model_config['scorer']
    tie_breaker = model_config['tie_breaker']
    ranking_cache = model_config['ranking_cache']

    return BERTUnfactoredDisambiguator(model_path,
                                       analyzer,
                                       top=top,
                                       features=features,
                                       scorer=scorer,
                                       tie_breaker=tie_breaker,
                                       use_gpu=use_gpu,
                                       batch_size=batch_size,
                                       ranking_cache=ranking_cache)
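# Hedged usage sketch for pretrained(): assumes the 'msa' model data has
# already been downloaded with the camel_data CLI. The disambiguate() call
# and the ScoredAnalysis fields follow the camel_tools API.
bert = BERTUnfactoredDisambiguator.pretrained('msa', top=1, use_gpu=False)
sentence = ['ذهب', 'الولد', 'إلى', 'المدرسة']
for disambig_word in bert.disambiguate(sentence):
    top_analysis = disambig_word.analyses[0].analysis
    print(disambig_word.word, top_analysis['pos'], top_analysis['lex'])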
def pretrained_from_config(config, top=1, use_gpu=True,
                           batch_size=32, cache_size=10000):
    """Load a pre-trained model from a config file.

    Args:
        config (:obj:`str`): Path to a config file that defines the model
            details.
        top (:obj:`int`, optional): The maximum number of top analyses
            to return. Defaults to 1.
        use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
            Defaults to True.
        batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
        cache_size (:obj:`int`, optional): If greater than zero, then the
            analyzer will cache the analyses for the cache_size most
            frequent words, otherwise no analyses will be cached.
            Defaults to 10000.

    Returns:
        :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
        pre-trained model.
    """

    model_config = _read_json(config)
    model_path = model_config['model_path']
    features = FEATURE_SET_MAP[model_config['feature']]
    db = MorphologyDB(model_config['db_path'], 'a')
    analyzer = Analyzer(db,
                        backoff=model_config['backoff'],
                        cache_size=cache_size)
    scorer = model_config['scorer']
    tie_breaker = model_config['tie_breaker']
    ranking_cache = model_config['ranking_cache']

    return BERTUnfactoredDisambiguator(model_path,
                                       analyzer,
                                       top=top,
                                       features=features,
                                       scorer=scorer,
                                       tie_breaker=tie_breaker,
                                       use_gpu=use_gpu,
                                       batch_size=batch_size,
                                       ranking_cache=ranking_cache)
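# Hedged sketch of a config file that pretrained_from_config() would accept.
# The keys mirror the lookups in the function body; all values here are
# purely illustrative, not taken from a shipped config.
#
# {
#     "model_path": "/path/to/bert_model",
#     "feature": "feats_14",
#     "db_path": "/path/to/morphology.db",
#     "backoff": "NONE",
#     "scorer": "uniform",
#     "tie_breaker": "tag",
#     "ranking_cache": null
# }
#
# bert = pretrained_from_config('my_config.json')  # path is hypothetical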
def load(lang, nlp=None):
    # Make sure the language is supported
    supported = {"en", "ar"}
    if lang not in supported:
        raise Exception("%s is an unsupported or unknown language" % lang)

    if lang == "en":
        # Load spacy
        nlp = nlp or spacy.load(lang, disable=["ner"])
        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)
        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The English classifier needs spacy
        classifier.nlp = nlp
        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)

    if lang == "ar":
        # Load a camel_tools analyzer and POS tagger instead of spacy
        # nlp = nlp or spacy.load(lang, disable=["ner"])
        db = MorphologyDB.builtin_db()
        analyzer = Analyzer(db)
        mled = MLEDisambiguator.pretrained()
        tagger = DefaultTagger(mled, 'pos')
        nlp = [analyzer, tagger]
        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)
        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # Unlike the English classifier, the Arabic one does not use spacy
        # classifier.nlp = nlp
        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)
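# Hedged usage sketch for load(): the parse/annotate calls mirror standard
# ERRANT usage for English; the Arabic pipeline above swaps spacy for a
# camel_tools analyzer and tagger, so its Annotator internals may differ.
annotator = load("en")
orig = annotator.parse("This are a sentence .")
cor = annotator.parse("This is a sentence .")
for edit in annotator.annotate(orig, cor):
    print(edit)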
def _calima_egy_r13_analyzer():
    db = MorphologyDB.builtin_db('calima-egy-r13', 'a')
    analyzer = Analyzer(db, 'NOAN_PROP')
    return analyzer
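# Hedged usage sketch: the EGY analyzer above, applied to a dialectal word.
# Assumes the 'calima-egy-r13' database has been downloaded; 'diac' and
# 'pos' are standard camel_tools analysis features.
analyzer = _calima_egy_r13_analyzer()
for analysis in analyzer.analyze('مبيحبش'):
    print(analysis['diac'], analysis['pos'])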
class Reinflector(object):
    """Morphological reinflector component.

    Arguments:
        db (:obj:`~camel_tools.morphology.database.MorphologyDB`): Database
            to use for generation. Must be opened in reinflection mode or
            both analysis and generation modes.

    Raises:
        :obj:`~camel_tools.morphology.errors.ReinflectorError`: If **db** is
            not an instance of
            :obj:`~camel_tools.morphology.database.MorphologyDB` or if
            **db** does not support reinflection.
    """

    def __init__(self, db):
        if not isinstance(db, MorphologyDB):
            raise ReinflectorError('DB is not an instance of MorphologyDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db
        self._analyzer = Analyzer(db)
        self._generator = Generator(db)

    def reinflect(self, word, feats):
        """Generate analyses for a given word from a given set of
        inflectional features.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/camel_morphology_features` for more
                information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

        Raises:
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in database.
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeatureValue`:
                If an invalid value is given to a feature or if 'pos'
                feature is not defined.
        """

        analyses = self._analyzer.analyze(word)

        if not analyses or len(analyses) == 0:
            return []

        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        has_clitics = False
        for feat in _CLITIC_FEATS:
            if feat in feats:
                has_clitics = True
                break

        results = deque()

        for analysis in analyses:
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue
            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            continue
                        elif analysis[feat] != 'na':
                            generate_feats[feat] = feats[feat]
                        else:
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)

                if generated is not None:
                    results.extend(generated)

        # TODO: Temporary fix to get unique analyses
        results = [dict(y) for y in set(tuple(x.items()) for x in results)]

        return list(results)
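# Hedged usage sketch for Reinflector: builtin_db() is opened with the 'r'
# (reinflection) flag so both analysis and generation are supported; the
# 'num' feature value follows the camel_tools feature reference.
db = MorphologyDB.builtin_db(flags='r')
reinflector = Reinflector(db)
for analysis in reinflector.reinflect('كتاب', {'num': 'p'}):
    print(analysis['diac'])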
import re as regEx

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.normalize import (normalize_alef_ar,
                                         normalize_alef_maksura_ar,
                                         normalize_teh_marbuta_ar)


class TClean():

    def __init__(self):
        self.CuratedList = self.loadCuratedList()
        self.stop_words = set(stopwords.words('arabic'))
        self.arStemmer = Analyzer(MorphologyDB.builtin_db())
        self.sentSegRegexPattern = self.loadSentSegmentationList()
        self.DotChar = '_'

    def loadCuratedList(self):
        curatedFile = open('../resources/CuratedList.txt', 'r',
                           encoding="utf-8")
        cList = {}
        while True:
            strLine = curatedFile.readline()
            if not strLine:
                break
            strKeyVal = strLine.replace('\n', '').split(":::")
            self.add_if_key_not_exist(cList, strKeyVal[0], strKeyVal[1])
        curatedFile.close()
        return cList

    def loadSentSegmentationList(self):
        sent_segmentationFile = open(
            '../resources/sent_segmentation_list.txt', 'r', encoding="utf-8")
        delimiterList = []
        while True:
            strLine = sent_segmentationFile.readline()
            if not strLine:
                break
            strLine = ' ' + strLine.replace('\n', '').strip() + ' '
            delimiterList.append(strLine)
        sent_segmentationFile.close()
        return '(' + '|'.join(map(regEx.escape, delimiterList)) + ')'

    def getSentTokenization(self, strDoc):
        return sent_tokenize(strDoc)

    def getWTokens(self, strTxt):
        return word_tokenize(strTxt)

    def getSegSentTokenization(self, strSentence, minSeqSentLen=30):
        if len(strSentence) <= minSeqSentLen:
            return [strSentence]
        return regEx.split(self.sentSegRegexPattern, strSentence)

    def softCleaning(self, strText):
        # Remove newlines
        strText = strText.replace('\n', ' ')
        # Remove Tashkeel (diacritics)
        strText = dediac_ar(strText)
        # Replace any matched token with its correction from the curated list
        for incorrectToken, correctedToken in self.CuratedList.items():
            strText = strText.replace(incorrectToken, correctedToken)
        # Fix comma and semicolon
        strText = self.replaceWrongComa(strText)
        # Remove extra spaces
        strText = regEx.sub(" +", " ", strText)
        return strText

    def hardCleaning(self, strText, removeStopWord=False,
                     applyLemmatize=False):
        # Apply soft cleaning first
        strText = self.softCleaning(strText)
        # Normalise
        strText = normalize_teh_marbuta_ar(strText)   # teh marbuta
        strText = normalize_alef_ar(strText)          # alef variants
        strText = normalize_alef_maksura_ar(strText)  # alef maksura
        strText = self.removeNonArabicChar(strText)
        strText = self.lemmatizeAndRemoveDotFromToken(strText, removeStopWord,
                                                      applyLemmatize)
        # Remove final sentence-dots
        # strText = strText.replace('.', ' ')
        return strText

    def replaceWrongComa(self, strText):
        # Keep comma and semicolon by mapping Latin punctuation to the
        # Arabic comma, semicolon, and question mark
        strText = strText.replace(",", "،").replace(";", "؛") \
                         .replace("?", "؟")
        # Add spaces around punctuation for correct separation
        strText = strText.replace("،", " ، ").replace("؛", " ؛ ") \
                         .replace("؟", " ؟ ").replace(":", " : ") \
                         .replace(".", " . ")
        return strText

    def removeNonArabicChar(self, strText):
        # Remove English and other non-Arabic (including special) characters
        strText = regEx.compile('([^\n\u060C-\u064A\.:؟?])').sub(' ', strText)
        # Remove extra spaces
        return regEx.sub(" +", " ", strText)

    def lemmatizeAndRemoveDotFromToken(self, strDoc, removeStopWord=False,
                                       applyLemmatize=False):
        getTokens = word_tokenize(strDoc)
        strDoc = ""
        for strToken in getTokens:
            sT = strToken.strip()
            # Skip if it's a stop word
            if removeStopWord and sT in self.stop_words:
                continue
            if applyLemmatize:
                sT = self.getStemWToken(sT)
            # Check dots: replace in-token dots with the placeholder char
            if '.' in sT and len(sT) > 1:
                sT = sT.replace(".", self.DotChar)
            if len(sT) < 2 and '.' not in sT:
                continue
            strDoc += sT + ' '
        return strDoc.strip()

    def getStemWToken(self, wToken):
        try:
            stemObject = self.arStemmer.analyze(wToken)
            # Remove Tashkeel and normalise the stem
            strText = dediac_ar(stemObject[0]['stem'])
            strText = normalize_teh_marbuta_ar(strText)
            strText = normalize_alef_ar(strText)
            strText = normalize_alef_maksura_ar(strText)
            return strText
        except Exception:
            # Fall back to the surface token if no analysis is found
            return wToken

    def add_if_key_not_exist(self, dict_obj, key, value):
        if key not in dict_obj:
            dict_obj.update({key: value})

    def toRound(self, dVal, iDigits=2):
        return np.round(dVal, iDigits)

    def readTxtFile(self, strPath):
        with open(strPath, 'r', encoding="utf-8") as file:
            return file.read().replace("\n", " ")
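# Hedged usage sketch for TClean: assumes the two resource files referenced
# in __init__ exist at those relative paths, and that the NLTK 'stopwords'
# and 'punkt' data have been downloaded. 'sample.txt' is hypothetical.
cleaner = TClean()
rawText = cleaner.readTxtFile('sample.txt')
print(cleaner.softCleaning(rawText))
print(cleaner.hardCleaning(rawText, removeStopWord=True, applyLemmatize=True))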
    if s_size > max_sentence:
        max_sentence = s_size
    sentence_size += s_size

fd.close()
print(min_sentence, max_sentence, sentence_size / len(sentences))

# Extract morphological properties of every word from the corpus
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)
# # Create analyzer with NOAN_PROP backoff
# analyzer = Analyzer(db, 'NOAN_PROP')

training_set = []
for sentence in sentences:
    s = []
    for word in sentence:
        analyses = analyzer.analyze(word['INPUT STRING'])
        # print(word, analyses)
        for d in analyses:
            # print(get_tag(d['bw']) == sentences[0][0]['POS'])
            tag = get_tag(d['bw'])
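# Hedged sketch of the data shape the loop above iterates over: each
# analysis is a dict of features, and 'bw' holds the Buckwalter-style
# morpheme/tag string that get_tag() presumably parses.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)
for d in analyzer.analyze('المدرسة'):
    print(d['bw'])  # e.g. something like 'Al/DET+madorasap/NOUN+...'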