Example #1
    def __init__(self, debug=False):
        # create a stemmer object for stemming enclitics and proclitics
        self.comp_stemmer = tashaphyne.stemming.ArabicLightStemmer()
        # configure the stemmer object
        self.comp_stemmer.set_infix_letters(snconst.COMP_INFIX_LETTERS)
        self.comp_stemmer.set_prefix_letters(snconst.COMP_PREFIX_LETTERS)
        self.comp_stemmer.set_suffix_letters(snconst.COMP_SUFFIX_LETTERS)
        self.comp_stemmer.set_max_prefix_length(snconst.COMP_MAX_PREFIX)
        self.comp_stemmer.set_max_suffix_length(snconst.COMP_MAX_SUFFIX)
        self.comp_stemmer.set_min_stem_length(snconst.COMP_MIN_STEM)
        self.comp_stemmer.set_prefix_list(snconst.COMP_PREFIX_LIST)
        self.comp_stemmer.set_suffix_list(snconst.COMP_SUFFIX_LIST)

        # create a stemmer object for stemming conjugated verbs
        self.conj_stemmer = tashaphyne.stemming.ArabicLightStemmer()
        # configure the stemmer object
        self.conj_stemmer.set_infix_letters(snconst.CONJ_INFIX_LETTERS)
        self.conj_stemmer.set_prefix_letters(snconst.CONJ_PREFIX_LETTERS)
        self.conj_stemmer.set_suffix_letters(snconst.CONJ_SUFFIX_LETTERS)
        self.conj_stemmer.set_max_prefix_length(snconst.CONJ_MAX_PREFIX)
        self.conj_stemmer.set_max_suffix_length(snconst.CONJ_MAX_SUFFIX)
        self.conj_stemmer.set_min_stem_length(snconst.CONJ_MIN_STEM)
        self.conj_stemmer.set_prefix_list(snconst.CONJ_PREFIX_LIST)
        self.conj_stemmer.set_suffix_list(snconst.CONJ_SUFFIX_LIST)
        #word frequency dictionary
        self.wordfreq = wordfreqdictionaryclass.WordFreqDictionary(
            'wordfreq', wordfreqdictionaryclass.WORDFREQ_DICTIONARY_INDEX)
        # use the word frequency dictionary as a dictionary for unknown words
        self.noun_dictionary = self.wordfreq

        self.debug = debug
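
The snippet below is a minimal usage sketch for a stemmer configured this way; it is not part of the class itself. The affix letters and length limits are illustrative stand-ins for the snconst constants, not their real values.

    # -*- coding: utf-8 -*-
    import tashaphyne.stemming

    stemmer = tashaphyne.stemming.ArabicLightStemmer()
    stemmer.set_prefix_letters(u"وف")   # hypothetical proclitic letters
    stemmer.set_suffix_letters(u"ها")   # hypothetical enclitic letters
    stemmer.set_max_prefix_length(2)
    stemmer.set_max_suffix_length(2)
    stemmer.set_min_stem_length(2)

    word = u"وكتابها"  # "and her book"
    stemmer.light_stem(word)
    print(stemmer.get_prefix())  # stripped proclitic(s)
    print(stemmer.get_stem())    # remaining stem
    print(stemmer.get_suffix())  # stripped enclitic(s)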
Example #2
    def __init__(self, allow_tag_guessing=True, allow_disambiguation=True):
        """
        Create Analex instance.
        """

        self.nounstemmer = stem_noun.NounStemmer()  # to stem nouns
        self.verbstemmer = stem_verb.VerbStemmer()  # to stem verbs
        self.unknownstemmer = stem_unknown.UnknownStemmer()  # to stem unknown words
        self.stopwordsstemmer = stem_stopwords.StopWordStemmer()  # to stem stopwords

        self.allow_tag_guessing = allow_tag_guessing
        # allow guessing tags with naftawayh before analysis;
        # if tagging is disabled, disambiguation is also disabled
        self.allow_disambiguation = allow_disambiguation and allow_tag_guessing
        # allow disambiguation before analysis
        # enable the last mark (Harakat Al-I3rab)
        self.allow_syntax_lastmark = True
        if self.allow_tag_guessing:
            self.tagger = naftawayh.wordtag.WordTagger()
        if self.allow_disambiguation:
            self.disambiguator = disambig.Disambiguator()
        self.debug = False  # whether to print internal data
        self.limit = 10000  # maximum number of words to process per text
        self.wordcounter = 0
        # words contain Arabic letters and harakat;
        # Unicode treats Arabic harakat as combining marks, not letters,
        # so we add them to the regular expression used to tokenize
        marks = u"".join(araby.TASHKEEL)
        # contains [FATHA, DAMMA, KASRA, SUKUN, DAMMATAN, KASRATAN,
        #  FATHATAN, SHADDA])
        # used to tokenize arabic text
        self.token_pat = re.compile(ur"([\w%s]+)" % marks, re.UNICODE)
        # used to split text into clauses
        self.clause_pattern = re.compile(ur"([\w%s\s]+)" % marks, re.UNICODE)

        # allow partial vocalization support:
        # the text may be analyzed as partially or fully vocalized
        self.partial_vocalization_support = True

        #word frequency dictionary
        self.wordfreq = wordfreqdictionaryclass.WordFreqDictionary(
            'wordfreq', wordfreqdictionaryclass.WORDFREQ_DICTIONARY_INDEX)

        # cache used to avoid duplicated lookups in the word frequency
        # database across multiple calls to analex
        self.allow_cache_use = False
        self.cache = cache.Cache()

        # when training on or analyzing vocalized text,
        # respect the Shadda in the given word
        self.fully_vocalized_input = False
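
As a minimal sketch of the tokenization idea above (Python 3, using pyarabic; the sample text is a hypothetical example): without the harakat in the character class, \w alone would split a vocalized word at every diacritic, since Unicode treats Arabic harakat as combining marks rather than letters.

    # -*- coding: utf-8 -*-
    import re
    import pyarabic.araby as araby

    marks = u"".join(araby.TASHKEEL)
    token_pat = re.compile(r"([\w%s]+)" % marks, re.UNICODE)

    text = u"كَتَبَ الوَلَدُ"           # fully vocalized: "the boy wrote"
    print(token_pat.findall(text))     # two tokens, harakat kept intact
    print(re.findall(r"\w+", text))    # for contrast: words broken at each mark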