def __init__(self, debug=False):
    # create a stemmer object for stemming enclitics and proclitics
    self.comp_stemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.comp_stemmer.set_infix_letters(snconst.COMP_INFIX_LETTERS)
    self.comp_stemmer.set_prefix_letters(snconst.COMP_PREFIX_LETTERS)
    self.comp_stemmer.set_suffix_letters(snconst.COMP_SUFFIX_LETTERS)
    self.comp_stemmer.set_max_prefix_length(snconst.COMP_MAX_PREFIX)
    self.comp_stemmer.set_max_suffix_length(snconst.COMP_MAX_SUFFIX)
    self.comp_stemmer.set_min_stem_length(snconst.COMP_MIN_STEM)
    self.comp_stemmer.set_prefix_list(snconst.COMP_PREFIX_LIST)
    self.comp_stemmer.set_suffix_list(snconst.COMP_SUFFIX_LIST)

    # create a stemmer object for stemming conjugated verbs
    self.conj_stemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.conj_stemmer.set_infix_letters(snconst.CONJ_INFIX_LETTERS)
    self.conj_stemmer.set_prefix_letters(snconst.CONJ_PREFIX_LETTERS)
    self.conj_stemmer.set_suffix_letters(snconst.CONJ_SUFFIX_LETTERS)
    self.conj_stemmer.set_max_prefix_length(snconst.CONJ_MAX_PREFIX)
    self.conj_stemmer.set_max_suffix_length(snconst.CONJ_MAX_SUFFIX)
    self.conj_stemmer.set_min_stem_length(snconst.CONJ_MIN_STEM)
    self.conj_stemmer.set_prefix_list(snconst.CONJ_PREFIX_LIST)
    self.conj_stemmer.set_suffix_list(snconst.CONJ_SUFFIX_LIST)

    # word frequency dictionary
    self.wordfreq = wordfreqdictionaryclass.WordFreqDictionary(
        'wordfreq', wordfreqdictionaryclass.WORDFREQ_DICTIONARY_INDEX)
    # use the word frequency dictionary as a dictionary for unknown words
    self.noun_dictionary = self.wordfreq
    self.debug = debug
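# A minimal usage sketch of a stemmer configured as above, kept in a
# comment so the module stays importable. It assumes the public
# tashaphyne ArabicLightStemmer API (light_stem(), get_prefix(),
# get_stem()); the instance name `stemmer` and the sample word are
# illustrative only:
#
#     stemmer = tashaphyne.stemming.ArabicLightStemmer()
#     stemmer.light_stem(u"وبالقلم")  # segment off proclitics/enclitics
#     print stemmer.get_prefix()      # the clitic part, e.g. conjunction
#     print stemmer.get_stem()        # the remaining core word
#
# The comp_stemmer (COMP_* constants) is tuned to peel clitics off the
# whole word, while the conj_stemmer (CONJ_* constants) is tuned to strip
# conjugation affixes from the remaining verb stem.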
def __init__(self, allow_tag_guessing=True, allow_disambiguation=True):
    """
    Create Analex instance.
    """
    self.nounstemmer = stem_noun.NounStemmer()  # to stem nouns
    self.verbstemmer = stem_verb.VerbStemmer()  # to stem verbs
    self.unknownstemmer = stem_unknown.UnknownStemmer()  # to stem unknown words
    self.stopwordsstemmer = stem_stopwords.StopWordStemmer()  # to stem stop words

    # allow guessing tags with naftawayh before analysis
    self.allow_tag_guessing = allow_tag_guessing
    # if tagging is disabled, disambiguation is also disabled
    self.allow_disambiguation = allow_disambiguation and allow_tag_guessing
    # enable the last mark (Harakat Al-I3rab)
    self.allow_syntax_lastmark = True
    if self.allow_tag_guessing:
        self.tagger = naftawayh.wordtag.WordTagger()
    if self.allow_disambiguation:
        self.disambiguator = disambig.Disambiguator()
    self.debug = False  # allow printing internal data
    self.limit = 10000  # limit of words to analyze in a text
    self.wordcounter = 0

    # Arabic words contain letters and harakat.
    # Unicode treats Arabic harakat as marks, not letters,
    # so we add the harakat to the regular expression used to tokenize.
    marks = u"".join(araby.TASHKEEL)
    # TASHKEEL contains FATHA, DAMMA, KASRA, SUKUN, DAMMATAN, KASRATAN,
    # FATHATAN and SHADDA
    # used to tokenize Arabic text
    self.token_pat = re.compile(ur"([\w%s]+)" % marks, re.UNICODE)
    # used to split text into clauses
    self.clause_pattern = re.compile(ur"([\w%s\s]+)" % marks, re.UNICODE)

    # allow partial vocalization support:
    # the text is analyzed as partially or fully vocalized
    self.partial_vocalization_support = True

    # word frequency dictionary
    self.wordfreq = wordfreqdictionaryclass.WordFreqDictionary(
        'wordfreq', wordfreqdictionaryclass.WORDFREQ_DICTIONARY_INDEX)

    # cache used to avoid duplicated lookups in the word frequency
    # database across multiple calls to analex
    self.allow_cache_use = False
    self.cache = cache.Cache()

    # in case of training and vocalized text analysis,
    # we propose to respect Shadda in the given word
    self.fully_vocalized_input = False
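# A minimal tokenization sketch using token_pat, kept in a comment so the
# module stays importable. It assumes this class is instantiated as
# Analex(); the sample sentence is illustrative only:
#
#     analyzer = Analex()
#     words = analyzer.token_pat.findall(u"ذَهَبَ الوَلَدُ إِلَى المَدْرَسَةِ")
#     # -> [u'ذَهَبَ', u'الوَلَدُ', u'إِلَى', u'المَدْرَسَةِ']
#
# Because the TASHKEEL marks are folded into the character class, each
# token keeps its diacritics instead of being split at every haraka,
# which \w alone would do since harakat are combining marks, not letters.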