Example #1
import six

from camel_tools.morphology.analyzer import Analyzer
from camel_tools.utils.stringutils import force_unicode, force_encoding

# _tokenize and _serialize_analyses are helpers defined elsewhere in the
# same CLI module.


def _analyze(db, fin, fout, backoff, cache):
    if cache:
        analyzer = Analyzer(db, backoff, cache_size=1024)
    else:
        analyzer = Analyzer(db, backoff)

    line = force_unicode(fin.readline())

    while line:
        line = line.strip()

        # Skip blank lines.
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        tokens = _tokenize(line)

        for token in tokens:
            analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
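
A minimal driver sketch for the function above, assuming camel_tools is installed and that _tokenize and _serialize_analyses are available as module helpers; 'input.txt' and the 'NOAN_PROP' backoff choice are illustrative:

import sys

from camel_tools.morphology.database import MorphologyDB

db = MorphologyDB.builtin_db()

# Analyze each line of a hypothetical input file to standard output.
with open('input.txt', encoding='utf-8') as fin:
    _analyze(db, fin, sys.stdout, backoff='NOAN_PROP', cache=True)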
Example #2
from collections import deque

from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.errors import (InvalidReinflectorFeature,
                                           InvalidReinflectorFeatureValue,
                                           ReinflectorError)
from camel_tools.morphology.generator import Generator
from camel_tools.utils.dediac import dediac_ar

# _ANY_FEATS, _CLITIC_FEATS, _CLITIC_IGNORED_FEATS, _IGNORED_FEATS,
# _SPECIFIED_FEATS, and _LEMMA_SPLIT_RE are module-level constants defined
# alongside this class.


class Reinflector(object):
    """Morphological reinflector component.

    Arguments:
        db (:obj:`~camel_tools.morphology.database.MorphologyDB`): Database to
            use for generation. Must be opened in reinflection mode or both
            analysis and generation modes.

    Raises:
        :obj:`~camel_tools.morphology.errors.ReinflectorError`: If **db** is
            not an instance of
            :obj:`~camel_tools.morphology.database.MorphologyDB` or if **db**
            does not support reinflection.
    """
    def __init__(self, db):
        if not isinstance(db, MorphologyDB):
            raise ReinflectorError('DB is not an instance of MorphologyDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db

        self._analyzer = Analyzer(db)
        self._generator = Generator(db)

    def reinflect(self, word, feats):
        """Generate analyses for a given word from a given set of inflectional
        features.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/camel_morphology_features` for more
                information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

        Raises:
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeature`:
                If a given feature is not defined in the database.
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeatureValue`:
                If an invalid value is given to a feature or if 'pos' feature
                is not defined.
        """

        analyses = self._analyzer.analyze(word)

        if not analyses:
            return []

        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        has_clitics = any(feat in feats for feat in _CLITIC_FEATS)

        results = deque()

        for analysis in analyses:
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue

            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            continue
                        elif analysis[feat] != 'na':
                            generate_feats[feat] = feats[feat]
                        else:
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        # TODO: Temporary fix to get unique analyses
        results = [dict(y) for y in set(tuple(x.items()) for x in results)]

        return list(results)
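
A short usage sketch for the class above, assuming the built-in database opened in reinflection mode; the example word and feature values are illustrative only:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.reinflector import Reinflector

# 'r' opens the built-in database in reinflection mode.
db = MorphologyDB.builtin_db(flags='r')
reinflector = Reinflector(db)

# Illustrative: reinflect a word toward a plural noun reading.
analyses = reinflector.reinflect('كتاب', {'pos': 'noun', 'num': 'p'})
for a in analyses:
    print(a['diac'])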
Example #3
import re as regEx  # the code below refers to the re module as regEx

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.normalize import (normalize_alef_ar,
                                         normalize_alef_maksura_ar,
                                         normalize_teh_marbuta_ar)


class TClean:

    def __init__(self):
        self.CuratedList = self.loadCuratedList()
        self.stop_words = set(stopwords.words('arabic'))
        self.arStemmer = Analyzer(MorphologyDB.builtin_db())
        self.sentSegRegexPattern = self.loadSentSegmentationList()
        self.DotChar = '_'
        
    #
    #
    #

    def loadCuratedList(self):
        # Load "incorrect:::corrected" token pairs into a dict.
        cList = {}
        with open('../resources/CuratedList.txt', 'r', encoding="utf-8") as curatedFile:
            for strLine in curatedFile:
                strKeyVal = strLine.replace('\n', '').split(":::")
                self.add_if_key_not_exist(cList, strKeyVal[0], strKeyVal[1])

        return cList
    
    #
    #
    #
      
    def loadSentSegmentationList(self):
        # Build an alternation regex from the delimiter list, padding each
        # delimiter with surrounding spaces.
        delimiterList = []
        with open('../resources/sent_segmentation_list.txt', 'r', encoding="utf-8") as sent_segmentationFile:
            for strLine in sent_segmentationFile:
                strLine = ' ' + strLine.replace('\n', '').strip() + ' '
                delimiterList.append(strLine)

        return '(' + '|'.join(map(regEx.escape, delimiterList)) + ')'

    #
    #
    #
       
    def getSentTokenization(self, strDoc):
        return sent_tokenize(strDoc)
    #
    #
    #
    def getWTokens(self, strTxt):
        return word_tokenize(strTxt)
    #
    #
    #
    
    def getSegSentTokenization(self, strSentence, minSeqSentLen=30):
        # Return short sentences as-is; split longer ones on the delimiter
        # pattern built in loadSentSegmentationList().
        if len(strSentence) <= minSeqSentLen:
            return [strSentence]
        return regEx.split(self.sentSegRegexPattern, strSentence)
    
    #
    #
    #
        
    def softCleaning(self, strText):

        #
        # Remove newlines
        strText = strText.replace('\n', ' ')

        #
        # Remove tashkeel (diacritics)
        strText = dediac_ar(strText)

        #
        # Replace each known-incorrect token with its curated correction
        for incorrectToken, correctedToken in self.CuratedList.items():
            strText = strText.replace(incorrectToken, correctedToken)

        #
        # Fix commas, semicolons, and question marks
        strText = self.replaceWrongComa(strText)

        #
        # Collapse repeated spaces
        strText = regEx.sub(" +", " ", strText)

        return strText

    #
    #
    #
    
    def hardCleaning(self, strText, removeStopWord=False, applyLemmatize=False):

        #
        # Apply soft cleaning first
        strText = self.softCleaning(strText)

        #
        # Normalize orthographic variants
        strText = normalize_teh_marbuta_ar(strText)   # teh marbuta (ة -> ه)
        strText = normalize_alef_ar(strText)          # alef variants (أ/إ/آ -> ا)
        strText = normalize_alef_maksura_ar(strText)  # alef maksura (ى -> ي)

        #
        # Keep Arabic characters only
        strText = self.removeNonArabicChar(strText)

        #
        # Tokenize, optionally drop stop words and lemmatize, and mask dots
        strText = self.lemmatizeAndRemoveDotFromToken(strText, removeStopWord, applyLemmatize)

        # Remove final sentence-dots
        #strText = strText.replace('.', ' ')
        return strText

    #
    #
    #
  

    def replaceWrongComa(self, strText):

        # Map Latin punctuation to the Arabic comma, semicolon, and question mark
        strText = strText.replace(",", "،").replace(";", "؛").replace("?", "؟")
        #
        # Pad punctuation with spaces so tokens separate correctly
        strText = strText.replace("،", " ، ").replace("؛", " ؛ ").replace("؟", " ؟ ").replace(":", " : ").replace(".", " . ")

        return strText
    #
    #
    #
    
    def removeNonArabicChar(self, strText):

        #
        # Remove English and other non-Arabic (including special) characters
        strText = regEx.compile(r'([^\n\u060C-\u064A\.:؟?])').sub(' ', strText)
        #
        # Collapse repeated spaces
        return regEx.sub(" +", " ", strText)
    #
    #
    #

    def lemmatizeAndRemoveDotFromToken(
            self, strDoc,
            removeStopWord=False,
            applyLemmatize=False):

        getTokens = word_tokenize(strDoc)
        strDoc = ""

        for strToken in getTokens:
            sT = strToken.strip()
            #
            # Skip stop words if requested
            if removeStopWord and sT in self.stop_words:
                continue
            #
            # Reduce the token to its stem if requested
            if applyLemmatize:
                sT = self.getStemWToken(sT)
            #
            # Mask dots inside multi-character tokens (e.g. abbreviations)
            if '.' in sT and len(sT) > 1:
                sT = sT.replace(".", self.DotChar)
            #
            # Drop single-character tokens that are not a lone dot
            if len(sT) < 2 and '.' not in sT:
                continue

            strDoc += sT + ' '

        return strDoc.strip()

    #
    #
    #

    def getStemWToken(self, wToken):
        # Return the dediacritized, normalized stem of the first analysis;
        # fall back to the surface token if analysis fails.
        try:
            stemObject = self.arStemmer.analyze(wToken)

            # Remove tashkeel and normalize
            strText = dediac_ar(stemObject[0]['stem'])
            strText = normalize_teh_marbuta_ar(strText)
            strText = normalize_alef_ar(strText)
            strText = normalize_alef_maksura_ar(strText)
            return strText
        except Exception:
            return wToken

        
    #
    #
    #

    def add_if_key_not_exist(self, dict_obj, key, value):
        # Keep the first mapping seen for a key.
        if key not in dict_obj:
            dict_obj[key] = value

    
    #
    #
    #
    
    def toRound(self, dVal, iDigits=2):
        return np.round(dVal, iDigits)

    #
    #
    #

    def readTxtFile(self, strPath):
        with open(strPath, 'r', encoding="utf-8") as file:
            return file.read().replace("\n", " ")
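
A usage sketch for the class above; it assumes the two files under ../resources/ exist and that NLTK's Arabic stopword list and punkt tokenizer data have been downloaded. 'sample.txt' is a placeholder path:

cleaner = TClean()
rawText = cleaner.readTxtFile('sample.txt')

softText = cleaner.softCleaning(rawText)
hardText = cleaner.hardCleaning(rawText, removeStopWord=True, applyLemmatize=True)

# Sentence-split the softly cleaned text, then tokenize each sentence.
for sentence in cleaner.getSentTokenization(softText):
    print(cleaner.getWTokens(sentence))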
# Extract morphological properties of every word in the corpus.
# Assumes `sentences` (a list of lists of word dicts with 'INPUT STRING',
# 'POS', and 'IS_TRANS' keys) and a get_tag helper are defined earlier.

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# # Create analyzer with NOAN_PROP backoff
# analyzer = Analyzer(db, 'NOAN_PROP')

training_set = []

for sentence in sentences:
    s = []
    for word in sentence:
        
        analyses = analyzer.analyze(word['INPUT STRING'])

        # Keep the first analysis whose Buckwalter-derived tag matches the
        # gold POS tag of this word.
        for d in analyses:
            tag = get_tag(d['bw'])
            if tag == word['POS']:
                s.append([word['IS_TRANS'], d])
                break

    training_set.append(s)
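
The loop above relies on a get_tag helper that is not shown. A plausible sketch, assuming d['bw'] holds a Buckwalter analysis string such as 'kitAb/NOUN+u/CASE_DEF_NOM' and the gold POS corresponds to the tag of its first morpheme (the real helper may well differ):

# Hypothetical reconstruction of the missing get_tag helper.
def get_tag(bw):
    # Take the first '+'-separated morpheme and return its '/'-separated tag.
    return bw.split('+')[0].split('/')[-1]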